diff --git a/lib/lua_lexer_loose.lua b/lib/lua_lexer_loose.lua index a550b9d..d0c12dd 100644 --- a/lib/lua_lexer_loose.lua +++ b/lib/lua_lexer_loose.lua @@ -2,6 +2,9 @@ lua_lexer_loose.lua. Loose lexing of Lua code. See README. + WARNING: This code is preliminary and may have errors + in its current form. + (c) 2013 David Manura. MIT License. --]] @@ -16,7 +19,7 @@ local function match_string(s, pos) pos = pos + 1 while 1 do pos = s:find("[" .. c .. "\\]", pos) - if not pos then return nil, posa, 'syntax error' end + if not pos then return s:sub(posa), #s + 1 end -- not terminated string if s:sub(pos,pos) == c then local part = s:sub(posa, pos) return part, pos + 1 @@ -28,7 +31,7 @@ local function match_string(s, pos) local sc = s:match("^%[(=*)%[", pos) if sc then local _; _, pos = s:find("%]" .. sc .. "%]", pos) - if not pos then return nil, posa, 'syntax error' end + if not pos then return s:sub(posa), #s + 1 end -- not terminated string local part = s:sub(posa, pos) return part, pos + 1 else @@ -56,40 +59,16 @@ end -- note: matches invalid numbers too local function match_numberlike(s, pos) - local a,b = s:match('^(%.?)([0-9])', pos) - if not a then - return nil -- not number - end - local tok, more - if b == '0' then - tok, more = s:match('^(0[xX][0-9a-fA-F]*)([_g-zG-Z]?)', pos) - if tok then -- hex - if #more == 0 and #tok > 2 then return tok end - end - end - if a == '.' then - tok, more = s:match('^(%.[0-9]+)([a-zA-Z_%.]?)', pos) - else - tok, more = s:match('^([0-9]+%.?[0-9]*)([a-zA-Z_%.]?)', pos) - end - if more ~= '' then - if more == 'e' or more == 'E' then -- exponent - local tok2, bad = s:match('^([eE][+-]?[0-9]+)([_a-zA-Z]?)', pos + #tok) - if tok2 and bad == '' then - return tok..tok2 - else - local tok2 = assert(s:match('^[eE][+-]?[0-9a-zA-Z_]*', pos + #tok)) - return tok..tok2, 'bad number' - end - else - local tok2 = s:match('^[0-9a-zA-Z_%.]*', pos + #tok) - return tok..tok2, 'bad number' - end + local tok = s:match('^0[xX][0-9A-Fa-f]*', pos) + if tok then return tok end + local tok = s:match('^[0-9%.]+', pos) + if tok then + local tok2 = s:match('^[eE][+-]?[0-9]*', pos + #tok) + if tok2 then tok = tok .. tok2 end + return tok end - assert(tok) - return tok + return nil end ---TODO: Lua 5.2 hex float local function newset(s) local t = {} @@ -104,8 +83,8 @@ end local sym = newset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_") local dig = newset('0123456789') -local dig2 = qws[[.0 .1 .2 .3 .4 .5 .6 .7 .8 .9]] local op = newset('=~<>.+-*/%^#=<>;:,.{}[]()') + op['=='] = true op['<='] = true op['>='] = true @@ -117,13 +96,10 @@ local is_keyword = qws[[ in local nil not or repeat return then true until while]] -function M.lex(code, f) - local pos = 1 - local tok = code:match('^#[^\n]*\n?', pos) -- shebang - if tok then - --f('Shebang', tok, 1) - pos = pos + #tok - end +function M.lex(code, f, pos) + local pos = pos or 1 + local tok = code:match('^#![^\n]*\n', pos) -- shebang + if tok then f('Shebang', tok, 1) pos = pos + #tok end while pos <= #code do local p2, n2, n1 = code:match('^%s*()((%S)%S?)', pos) if not p2 then assert(code:sub(pos):match('^%s*$')); break end @@ -144,18 +120,18 @@ function M.lex(code, f) f('Comment', tok, pos) pos = pos2 elseif n1 == '\'' or n1 == '\"' or n2 == '[[' or n2 == '[=' then - local tok, _pos2 = match_string(code, pos) + local tok = match_string(code, pos) if tok then f('String', tok, pos) + pos = pos + #tok else f('Unknown', code:sub(pos), pos) -- unterminated string + pos = #code + 1 end - pos = pos + #tok - elseif dig[n1] or dig2[n2] then - local tok, err = match_numberlike(code, pos) assert(tok) + elseif dig[n1] then + local tok = match_numberlike(code, pos) assert(tok) - if err then f('Unknown', tok) - else f('Number', tok, pos) end + f('Number', tok, pos) pos = pos + #tok elseif op[n2] then if n2 == '..' and code:match('^%.', pos+2) then @@ -178,7 +154,7 @@ end local Stream = {} Stream.__index = Stream -function Stream:next() +function Stream:next(val) if self._next then local _next = self._next self._next = nil @@ -198,17 +174,15 @@ function Stream:peek() end end -function M.lexc(code, f) +function M.lexc(code, f, pos) local yield = coroutine.yield - local f = coroutine.wrap(function() - M.lex(code, function(tag, name, pos) --print(tag, '['..name..']') - if tag ~= 'Comment' then - yield {tag=tag, name, lineinfo=pos} - end - end) + local func = coroutine.wrap(f or function() + M.lex(code, function(tag, name, pos) + yield {tag=tag, name, lineinfo=pos} + end, pos) yield {tag='Eof'} end) - return setmetatable({f=f}, Stream) + return setmetatable({f=func}, Stream) end return M diff --git a/lib/lua_parser_loose.lua b/lib/lua_parser_loose.lua index 9bc07d7..38d0f2c 100644 --- a/lib/lua_parser_loose.lua +++ b/lib/lua_parser_loose.lua @@ -6,10 +6,8 @@ local PARSE = {} +local unpack = table.unpack or unpack local LEX = require 'lua_lexer_loose' ---local LEX = require 'lua_lexer_loose_metalua' - - local function warn(message, position) io.stderr:write('WARNING: ', tostring(position), ': ', message, '\n') @@ -30,50 +28,68 @@ end 'VarInside', name, lineinfo - variable definition that comes into scope inside following block. Used for control variables in 'for' statements. 'Id', name, lineinfo - reference to variable. - 'String', name - string or table field - 'Scope', opt - beginning of scope block - 'Endscope', nil, lineinfo - end of scope block + 'String', name - string or table field. + 'Scope', opt - beginning of scope block. + 'EndScope', nil, lineinfo - end of scope block. + 'FunctionCall', name, lineinfo - function call (in addition to other events). + 'Function', name, lineinfo - function definition. --]] -function PARSE.parse_scope(lx, f) +function PARSE.parse_scope(lx, f, level) local cprev = {tag='Eof'} -- stack of scopes. local scopes = {{}} + for l = 2, (level or 1) do scopes[l] = {} end - local function scope_begin(opt) + local function scope_begin(opt, lineinfo, nobreak) scopes[#scopes+1] = {} - f('Scope', opt) + f('Scope', opt, lineinfo, nobreak) end - local function scope_end(lineinfo) - if #scopes <= 1 then + local function scope_end(opt, lineinfo) + local scope = #scopes + if scope <= 1 then warn("'end' without opening block", lineinfo) else table.remove(scopes) end - f('Endscope', nil, lineinfo) + local inside_local = false + for scope = scope-1, 1, -1 do + if scopes[scope].inside_local then inside_local = true; break end + end + f('EndScope', opt, lineinfo, inside_local) end - local function parse_function_list(has_self) + local function parse_function_list(has_self, name, pos) local c = lx:next(); assert(c[1] == '(') + f('Statement', c[1], c.lineinfo, true) -- generate Statement for function definition + scope_begin(c[1], c.lineinfo, true) + + local vars = {} -- accumulate vars (if any) to send after 'Function' if has_self then - local lineinfo = {c.lineinfo+1} -- zero size - f('VarSelf', 'self', lineinfo) + local lineinfo = c.lineinfo+1 -- zero size + table.insert(vars, {'VarSelf', 'self', lineinfo, true}) + end + while true do + local n = lx:peek() + if not (n.tag == 'Id' or n.tag == 'Keyword' and n[1] == '...') then break end + local c = lx:next() + if c.tag == 'Id' then table.insert(vars, {'Var', c[1], c.lineinfo, true}) end + -- ignore '...' in this case + if lx:peek()[1] == ',' then lx:next() end end - c = lx:next() - while c.tag == 'Id' do - f('Var', c[1], c.lineinfo) - c= lx:next() - if c[1] == ',' then c = lx:next() end + if lx:peek()[1] == ')' then + lx:next() + f('Function', name, pos or c.lineinfo, true) end + for _, var in ipairs(vars) do f(unpack(var)) end end - while 1 do + while true do local c = lx:next() - if c.tag == 'Eof' then break end - --print('DEBUG', c.lineinfo) -- Detect end of previous statement - if c.tag == 'Keyword' and ( + if c.tag == 'Eof' -- trigger 'Statement' at the end of file + or c.tag == 'Keyword' and ( c[1] == 'break' or c[1] == 'goto' or c[1] == 'do' or c[1] == 'while' or c[1] == 'repeat' or c[1] == 'if' or c[1] == 'for' or c[1] == 'function' and lx:peek().tag == 'Id' or c[1] == 'local' or c[1] == ';' or c[1] == 'until' or c[1] == 'return' or c[1] == 'end') or @@ -86,63 +102,73 @@ function PARSE.parse_scope(lx, f) cprev[1] == 'nil') or cprev.tag == 'Number' or cprev.tag == 'String') then - if scopes[#scopes].inside_until then scope_end(c.lineinfo) end - f('Statement') + if scopes[#scopes].inside_until then scope_end(nil, c.lineinfo) end + local scope = #scopes + if not scopes[scope].inside_table then scopes[scope].inside_local = nil end + f('Statement', c[1], c.lineinfo, + scopes[scope].inside_local or c[1] == 'local' or c[1] == 'function' or c[1] == 'end') end + + if c.tag == 'Eof' then break end - -- Process token(s). + -- Process token(s) if c.tag == 'Keyword' then if c[1] == 'local' and lx:peek().tag == 'Keyword' and lx:peek()[1] == 'function' then -- local function - c = lx:next(); assert(c[1] == 'function') - c = lx:next() - f('Var', c[1], c.lineinfo) - scope_begin() - parse_function_list() + local c = lx:next(); assert(c[1] == 'function') + if lx:peek().tag == 'Id' then + c = lx:next() + f('Var', c[1], c.lineinfo, true) + if lx:peek()[1] == '(' then parse_function_list(nil, c[1], c.lineinfo) end + end elseif c[1] == 'function' then if lx:peek()[1] == '(' then -- inline function - scope_begin() parse_function_list() - else -- function definition statement + elseif lx:peek().tag == 'Id' then -- function definition statement c = lx:next(); assert(c.tag == 'Id') - f('Id', c[1], c.lineinfo) + local name = c[1] + local pos = c.lineinfo + f('Id', name, pos, true) local has_self - while lx:peek()[1] ~= '(' do + while lx:peek()[1] ~= '(' and lx:peek().tag ~= 'Eof' do c = lx:next() + name = name .. c[1] if c.tag == 'Id' then - f('String', c[1]) + f('String', c[1], c.lineinfo, true) elseif c.tag == 'Keyword' and c[1] == ':' then has_self = true end end - scope_begin() - parse_function_list(has_self) + if lx:peek()[1] == '(' then parse_function_list(has_self, name, pos) end end - elseif c[1] == 'local' then + elseif c[1] == 'local' and lx:peek().tag == 'Id' then + scopes[#scopes].inside_local = true c = lx:next() - f('VarNext', c[1], c.lineinfo) + f('VarNext', c[1], c.lineinfo, true) while lx:peek().tag == 'Keyword' and lx:peek()[1] == ',' do - c = lx:next(); c = lx:next() - f('VarNext', c[1], c.lineinfo) + c = lx:next(); if lx:peek().tag ~= 'Id' then break end + c = lx:next() + f('VarNext', c[1], c.lineinfo, true) end - elseif c[1] == 'for' then + elseif c[1] == 'for' and lx:peek().tag == 'Id' then c = lx:next() - f('VarInside', c[1], c.lineinfo) + f('VarInside', c[1], c.lineinfo, true) while lx:peek().tag == 'Keyword' and lx:peek()[1] == ',' do - c = lx:next(); c = lx:next() - f('VarInside', c[1], c.lineinfo) + c = lx:next(); if lx:peek().tag ~= 'Id' then break end + c = lx:next() + f('VarInside', c[1], c.lineinfo, true) end elseif c[1] == 'do' then - scope_begin('do') + scope_begin('do', c.lineinfo) -- note: do/while/for statement scopes all begin at 'do'. elseif c[1] == 'repeat' or c[1] == 'then' then - scope_begin() + scope_begin(c[1], c.lineinfo) elseif c[1] == 'end' or c[1] == 'elseif' then - scope_end(c.lineinfo) + scope_end(c[1], c.lineinfo) elseif c[1] == 'else' then - scope_end(c.lineinfo) - scope_begin() + scope_end(nil, c.lineinfo) + scope_begin(c[1], c.lineinfo) elseif c[1] == 'until' then scopes[#scopes].inside_until = true elseif c[1] == '{' then @@ -153,17 +179,25 @@ function PARSE.parse_scope(lx, f) scopes[#scopes].inside_table = newval end elseif c.tag == 'Id' then - if scopes[#scopes].inside_table and lx:peek().tag == 'Keyword' and lx:peek()[1] == '=' then + local cnext = lx:peek() + if cnext.tag == 'Keyword' and (cnext[1] == '(' or cnext[1] == '{') + or cnext.tag == 'String' then + f('FunctionCall', c[1], c.lineinfo, scopes[#scopes].inside_local ~= nil) + end + local scope = #scopes + local inside_local = scopes[scope].inside_local ~= nil + if (scopes[scope].inside_table or cprev[1] == ',') + and cnext.tag == 'Keyword' and cnext[1] == '=' then -- table field - f('String', c[1]) + f('String', c[1], c.lineinfo, inside_local) elseif cprev.tag == 'Keyword' and (cprev[1] == ':' or cprev[1] == '.') then - f('String', c[1]) + f('String', c[1], c.lineinfo, inside_local) else - f('Id', c[1], c.lineinfo) + f('Id', c[1], c.lineinfo, inside_local) end end - cprev = c + if c.tag ~= 'Comment' then cprev = c end end end @@ -177,42 +211,50 @@ end 'Id', name, lineinfo, 'local'|'global' (plus all events in parse_scope) --]] -function PARSE.parse_scope_resolve(lx, f) +function PARSE.parse_scope_resolve(lx, f, vars) local NEXT = {} -- unique key local INSIDE = {} -- unique key - local function newscope(vars, opt) + local function newscope(vars, opt, lineinfo) local newvars = opt=='do' and vars[INSIDE] or {} if newvars == vars[INSIDE] then vars[INSIDE] = false end newvars[INSIDE]=false newvars[NEXT]=false + local level = (vars[0] or 0) + 1 + newvars[0] = level -- keep the current level + newvars[-1] = lineinfo -- keep the start of the scope + newvars[level] = newvars -- reference the current vars table return setmetatable(newvars, {__index=vars}) end - local vars = {} + vars = vars or newscope({[0] = 0}, nil, 1) vars[NEXT] = false -- vars that come into scope upon next statement vars[INSIDE] = false -- vars that come into scope upon entering block - PARSE.parse_scope(lx, function(op, name, lineinfo) - --print('DEBUG', op, name) - local other + PARSE.parse_scope(lx, function(op, name, lineinfo, nobreak) + -- in some (rare) cases VarNext can follow Statement event (which copies + -- vars[NEXT]). This may cause vars[0] to be `nil`, so default to 1. + local var = op:find("^Var") and + {fpos = lineinfo, at = (vars[0] or 1) + (op == 'VarInside' and 1 or 0), + masked = vars[name], self = (op == 'VarSelf') or nil } or nil if op == 'Var' or op == 'VarSelf' then - vars[name] = true + vars[name] = var elseif op == 'VarNext' then - vars[NEXT] = vars[NEXT] or {}; vars[NEXT][name] = true + vars[NEXT] = vars[NEXT] or {} + vars[NEXT][name] = var elseif op == 'VarInside' then - vars[INSIDE] = vars[INSIDE] or {}; vars[INSIDE][name] = true + vars[INSIDE] = vars[INSIDE] or {} + vars[INSIDE][name] = var elseif op == 'Scope' then - vars = newscope(vars, name) - elseif op == 'Endscope' then + vars = newscope(vars, name, lineinfo) + elseif op == 'EndScope' then local mt = getmetatable(vars) if mt == nil then warn("'end' without opening block.", lineinfo) else vars = mt.__index end - elseif op == 'Id' then - if vars[name] then other = 'local' else other = 'global' end - elseif op == 'String' then - -- + elseif op == 'Id' + or op == 'String' or op == 'FunctionCall' or op == 'Function' then + -- Just make callback elseif op == 'Statement' then -- beginning of statement -- Apply vars that come into scope upon beginning of statement. if vars[NEXT] then @@ -223,8 +265,8 @@ function PARSE.parse_scope_resolve(lx, f) else assert(false) end - f(op, name, lineinfo, other) - end) + f(op, name, lineinfo, vars, nobreak) + end, vars[0]) end function PARSE.extract_vars(code, f) @@ -232,23 +274,39 @@ function PARSE.extract_vars(code, f) local char0 = 1 -- next char offset to write local function gen(char1, nextchar0) - if char1 > char0 then f('Other', code:sub(char0, char1-1)) end char0 = nextchar0 end PARSE.parse_scope_resolve(lx, function(op, name, lineinfo, other) - --print(op, name, lineinfo, other) if op == 'Id' then - gen(lineinfo, lineinfo+#name) - f('Id', name, other) + f('Id', name, other, lineinfo) elseif op == 'Var' or op == 'VarNext' or op == 'VarInside' then gen(lineinfo, lineinfo+#name) - f('Var', name) + f('Var', name, "local", lineinfo) end -- ignore 'VarSelf' and others end) gen(#code+1, nil) end +--[[ + Converts 5.2 code to 5.1 style code with explicit _ENV variables. + Example: "function f(_ENV, x) print(x, y)" --> + "function _ENV.f(_ENV, x) _ENV.print(x, _ENV.y) end" + + code - string of Lua code. Assumed to be valid Lua (FIX: 5.1 or 5.2?) + f(s) - call back function to send chunks of Lua code output to. Example: io.stdout. +--]] +function PARSE.replace_env(code, f) + if not f then return PARSE.accumulate(PARSE.replace_env, code) end + PARSE.extract_vars(code, function(op, name, other) + if op == 'Id' then + f(other == 'global' and '_ENV.' .. name or name) + elseif op == 'Var' or op == 'Other' then + f(name) + end + end) +end + -- helper function. Can be passed as argument `f` to functions -- like `replace_env` above to accumulate fragments into a single string. function PARSE.accumulator() diff --git a/misc/lua-parser-loose-scm-1.rockspec b/misc/lua-parser-loose-scm-1.rockspec new file mode 100644 index 0000000..4cd3353 --- /dev/null +++ b/misc/lua-parser-loose-scm-1.rockspec @@ -0,0 +1,48 @@ +package = "lua-parser-loose" +version = "scm-1" + +source = { + url = "git://github.com/davidm/lua-parser-loose.git", +} + +description = { + summary = "loose parsing of Lua code, ignoring syntax errors", + detailed = [[ + Does loose parsing of Lua code. + If the code has syntax errors, the parse does not abort; rather, + some information (e.g. local and global variable scopes) is still inferred. + This may be useful for code interactively typed into a text editor. + + Characteristics of this code: + - Parsing does not construct any AST but rather streams tokens. + It should be memory efficient on large files. + It is also pretty fast. + - Very loose parsing. + Does not abort on broken code. + Scopes of local variables are still resolved even if the code is + not syntactically valid. + - Above characteristics make it suitable for use in a text editor, + where code may be interactively typed. + - Loose parsing makes this code somewhat hard to validate its correctness, + but tests are performed to verify robustness. + - The parsing code is designed so that parts of it may be reused for other + purposes in other projects. + ]], + license = "MIT/X11", + homepage = "https://github.com/davidm/lua-parser-loose" +} + +dependencies = { + "lua >= 5.0", +} + +build = { + type = "none", + install = { + lua = { + ["lua_lexer_loose"] = "lib/lua_lexer_loose.lua", + ["lua_parser_loose"] = "lib/lua_parser_loose.lua" + } + }, + copy_directories = { "example", "test" } +}