-- Current file: /usr/share/texlive/texmf-dist/tex/luatex/lualibs/lualibs-lpeg.lua
-- Register this library in the global 'modules' table, creating that table
-- when no other library has done so yet.
modules = modules or { }

modules['l-lpeg'] = {
    version   = 1.001,
    comment   = "companion to luat-lib.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
-- we can get too many captures (e.g. on large xml files) which makes me wonder
-- if P(foo)/"" can't be simplified to N(foo) i.e. some direct instruction to the
-- lpeg virtual machine to ignore it
-- lpeg 12 vs lpeg 10: slower compilation, similar parsing speed (i need to check
-- if i can use new features like capture / 2 and .B (at first sight the xml
-- parser is some 5% slower)
-- lpeg.P("abc") is faster than lpeg.P("a") * lpeg.P("b") * lpeg.P("c")
-- a new lpeg fails on a #(1-P(":")) test and really needs a + P(-1)
-- move utf -> l-unicode
-- move string -> l-string or keep it here
-- lpeg.B : backward without consumption
-- lpeg.F = getmetatable(lpeg.P(1)).__len : forward without consumption
lpeg = require("lpeg") -- does lpeg register itself global?
local lpeg = lpeg
-- The latest lpeg doesn't have print any more, and even the new ones are not
-- available by default (only when debug mode is enabled), which is a pity
-- as it helps nailing down bottlenecks. Performance seems comparable: some 10%
-- slower pattern compilation, same parsing speed, although,
--
-- local p = lpeg.C(lpeg.P(1)^0 * lpeg.P(-1))
-- local a = string.rep("123",100)
-- lpeg.match(p,a)
--
-- seems slower and is also still suboptimal (i.e. a match that runs from begin
-- to end, one of the cases where string matchers win).
-- Provide a stand-in for lpeg.print when the library does not offer one: it
-- prints whatever lpeg.pcode returns for the given arguments.
if not lpeg.print then
    lpeg.print = function(...)
        print(lpeg.pcode(...))
    end
end
-- tracing (only used when we encounter a problem in integration of lpeg in luatex)
-- some code will move to unicode and string
-- local lpmatch = lpeg.match
-- local lpprint = lpeg.print
-- local lpp = lpeg.P
-- local lpr = lpeg.R
-- local lps = lpeg.S
-- local lpc = lpeg.C
-- local lpb = lpeg.B
-- local lpv = lpeg.V
-- local lpcf = lpeg.Cf
-- local lpcb = lpeg.Cb
-- local lpcg = lpeg.Cg
-- local lpct = lpeg.Ct
-- local lpcs = lpeg.Cs
-- local lpcc = lpeg.Cc
-- local lpcmt = lpeg.Cmt
-- local lpcarg = lpeg.Carg
-- function lpeg.match(l,...) print("LPEG MATCH") lpprint(l) return lpmatch(l,...) end
-- function lpeg.P (l) local p = lpp (l) print("LPEG P =") lpprint(l) return p end
-- function lpeg.R (l) local p = lpr (l) print("LPEG R =") lpprint(l) return p end
-- function lpeg.S (l) local p = lps (l) print("LPEG S =") lpprint(l) return p end
-- function lpeg.C (l) local p = lpc (l) print("LPEG C =") lpprint(l) return p end
-- function lpeg.B (l) local p = lpb (l) print("LPEG B =") lpprint(l) return p end
-- function lpeg.V (l) local p = lpv (l) print("LPEG V =") lpprint(l) return p end
-- function lpeg.Cf (l) local p = lpcf (l) print("LPEG Cf =") lpprint(l) return p end
-- function lpeg.Cb (l) local p = lpcb (l) print("LPEG Cb =") lpprint(l) return p end
-- function lpeg.Cg (l) local p = lpcg (l) print("LPEG Cg =") lpprint(l) return p end
-- function lpeg.Ct (l) local p = lpct (l) print("LPEG Ct =") lpprint(l) return p end
-- function lpeg.Cs (l) local p = lpcs (l) print("LPEG Cs =") lpprint(l) return p end
-- function lpeg.Cc (l) local p = lpcc (l) print("LPEG Cc =") lpprint(l) return p end
-- function lpeg.Cmt (l) local p = lpcmt (l) print("LPEG Cmt =") lpprint(l) return p end
-- function lpeg.Carg (l) local p = lpcarg(l) print("LPEG Carg =") lpprint(l) return p end
local type, next, tostring = type, next, tostring
local byte, char, gmatch, format = string.byte, string.char, string.gmatch, string.format
----- mod, div = math.mod, math.div
local floor = math.floor
local P, R, S, V, Ct, C, Cs, Cc, Cp, Cmt = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp, lpeg.Cmt
local lpegtype, lpegmatch, lpegprint = lpeg.type, lpeg.match, lpeg.print
-- let's start with an inspector:
-- When an inspector registry is available, hook in a handler that prints lpeg
-- patterns and reports (by returning true) that it handled the value.
if setinspector then
    local function inspectlpeg(v)
        if lpegtype(v) then
            lpegprint(v)
            return true
        end
    end
    setinspector("lpeg",inspectlpeg)
end
-- Beware, we predefine a bunch of patterns here and one reason for doing so
-- is that we get consistent behaviour in some of the visualizers.
-- Shared registry of predefined patterns; an already existing lpeg.patterns
-- table is reused instead of being replaced.
lpeg.patterns = lpeg.patterns or { } -- so that we can share
local patterns = lpeg.patterns
-- The primitive building blocks used throughout this file.
local anything = P(1) -- exactly one character (byte)
local endofstring = P(-1) -- succeeds only when no input is left
local alwaysmatched = P(true) -- succeeds without consuming input
patterns.anything = anything
patterns.endofstring = endofstring
patterns.beginofstring = alwaysmatched -- the very same pattern as alwaysmatched
patterns.alwaysmatched = alwaysmatched
-- Basic character classes and single character patterns.
local sign = S('+-')
local zero = P('0')
local digit = R('09')
local digits = digit^1
local octdigit = R("07")
local octdigits = octdigit^1
local lowercase = R("az")
local uppercase = R("AZ")
local underscore = P("_")
-- Only the real hexadecimal digits are accepted. Accepting every letter made
-- the mantissa of a hexadecimal float swallow the 'p' exponent marker and let
-- strings like "0xZZ" pass as hexadecimal numbers.
local hexdigit = digit + R("af","AF")
local hexdigits = hexdigit^1
local cr, lf, crlf = P("\r"), P("\n"), P("\r\n")
----- newline = crlf + S("\r\n") -- cr + lf
-- One line ending: "\r\n", a lone "\r" or a lone "\n".
local newline = P("\r") * (P("\n") + P(true)) + P("\n") -- P("\r")^-1 * P("\n")^-1
-- A backslash followed by any single character.
local escaped = P("\\") * anything
local squote = P("'")
local dquote = P('"')
local space = P(" ")
local period = P(".")
local comma = P(",")
-- Byte order marks of the supported unicode encodings.
local utfbom_32_be = P('\000\000\254\255') -- 00 00 FE FF
local utfbom_32_le = P('\255\254\000\000') -- FF FE 00 00
local utfbom_16_be = P('\254\255') -- FE FF
local utfbom_16_le = P('\255\254') -- FF FE
local utfbom_8 = P('\239\187\191') -- EF BB BF
-- Any byte order mark. The 32 bit marks are tried first: the utf-32-le mark
-- starts with the two bytes that make up the utf-16-le mark.
local utfbom = utfbom_32_be + utfbom_32_le
+ utfbom_16_be + utfbom_16_le
+ utfbom_8
-- Consumes a byte order mark when present and produces the encoding name;
-- without a mark nothing is consumed and "utf-8" is produced.
local utftype = utfbom_32_be * Cc("utf-32-be") + utfbom_32_le * Cc("utf-32-le")
+ utfbom_16_be * Cc("utf-16-be") + utfbom_16_le * Cc("utf-16-le")
+ utfbom_8 * Cc("utf-8") + alwaysmatched * Cc("utf-8") -- assume utf8
-- Same as utftype but fails when there is no byte order mark.
local utfstricttype = utfbom_32_be * Cc("utf-32-be") + utfbom_32_le * Cc("utf-32-le")
+ utfbom_16_be * Cc("utf-16-be") + utfbom_16_le * Cc("utf-16-le")
+ utfbom_8 * Cc("utf-8")
-- Produces the length in bytes of the byte order mark, or 0 when there is none.
local utfoffset = utfbom_32_be * Cc(4) + utfbom_32_le * Cc(4)
+ utfbom_16_be * Cc(2) + utfbom_16_le * Cc(2)
+ utfbom_8 * Cc(3) + Cc(0)
-- A byte in the range 0x80-0xBF, used below as the follow-up byte of a
-- multi byte utf8 sequence.
local utf8next = R("\128\191")
patterns.utfbom_32_be = utfbom_32_be
patterns.utfbom_32_le = utfbom_32_le
patterns.utfbom_16_be = utfbom_16_be
patterns.utfbom_16_le = utfbom_16_le
patterns.utfbom_8 = utfbom_8
-- Line endings as they show up in utf-16 and utf-32 encoded data; the combined
-- "\r\n" form is tried before the single character forms.
patterns.utf_16_be_nl = P("\000\r\000\n") + P("\000\r") + P("\000\n") -- P("\000\r") * (P("\000\n") + P(true)) + P("\000\n")
patterns.utf_16_le_nl = P("\r\000\n\000") + P("\r\000") + P("\n\000") -- P("\r\000") * (P("\n\000") + P(true)) + P("\n\000")
patterns.utf_32_be_nl = P("\000\000\000\r\000\000\000\n") + P("\000\000\000\r") + P("\000\000\000\n")
patterns.utf_32_le_nl = P("\r\000\000\000\n\000\000\000") + P("\r\000\000\000") + P("\n\000\000\000")
-- One utf8 character of one, two, three or four bytes, selected by the range
-- of the first byte.
patterns.utf8one = R("\000\127")
patterns.utf8two = R("\194\223") * utf8next
patterns.utf8three = R("\224\239") * utf8next * utf8next
patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next
patterns.utfbom = utfbom
patterns.utftype = utftype
patterns.utfstricttype = utfstricttype
patterns.utfoffset = utfoffset
local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
-- Produces true when the whole subject is a sequence of utf8char matches and
-- false otherwise.
local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false)
-- Any byte followed by all bytes in the range 0x80-0xBF that come after it.
local utf8character = P(1) * R("\128\191")^0 -- unchecked but fast
patterns.utf8 = utf8char
patterns.utf8char = utf8char
patterns.utf8character = utf8character -- this one can be used in most cases so we might use that one
patterns.validutf8 = validutf8char
patterns.validutf8char = validutf8char
-- Character classes: 'spacer' is horizontal space (space, tab, formfeed,
-- vertical tab), 'whitespace' adds the line ending characters to that.
local eol = S("\n\r")
local spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto)
local whitespace = eol + spacer
local nonspacer = 1 - spacer
local nonwhitespace = 1 - whitespace
patterns.eol = eol
patterns.spacer = spacer
patterns.whitespace = whitespace
patterns.nonspacer = nonspacer
patterns.nonwhitespace = nonwhitespace
-- Capture the subject without its leading and trailing spacers (stripper) or
-- whitespace (fullstripper); inner runs are kept as they are.
local stripper = spacer ^0 * C((spacer ^0 * nonspacer ^1)^0) -- from example by roberto
local fullstripper = whitespace^0 * C((whitespace^0 * nonwhitespace^1)^0)
----- collapser = Cs(spacer^0/"" * ((spacer^1 * endofstring / "") + (spacer^1/" ") + P(1))^0)
-- Drop leading spacers and replace each spacer run that is followed by other
-- text by one space.
local collapser = Cs(spacer^0/"" * nonspacer^0 * ((spacer^0/" " * nonspacer^1)^0))
-- Remove every run of whitespace.
local nospacer = Cs((whitespace^1/"" + nonwhitespace^1)^0)
-- Whitespace based substitutions; runs of whitespace become one space unless
-- stated otherwise:
--   b_ : additionally removes whitespace at the begin
--   m_ : no special treatment of begin or end
--   e_ : additionally removes whitespace at the end
--   x_ : removes all whitespace runs instead of collapsing them
local b_collapser = Cs( whitespace^0 /"" * (nonwhitespace^1 + whitespace^1/" ")^0)
local m_collapser = Cs( (nonwhitespace^1 + whitespace^1/" ")^0)
local e_collapser = Cs((whitespace^1 * endofstring/"" + nonwhitespace^1 + whitespace^1/" ")^0)
local x_collapser = Cs( (nonwhitespace^1 + whitespace^1/"" )^0)
-- The same four variants, but acting on spacers only so that line endings are
-- left untouched.
local b_stripper = Cs( spacer^0 /"" * (nonspacer^1 + spacer^1/" ")^0)
local m_stripper = Cs( (nonspacer^1 + spacer^1/" ")^0)
local e_stripper = Cs((spacer^1 * endofstring/"" + nonspacer^1 + spacer^1/" ")^0)
local x_stripper = Cs( (nonspacer^1 + spacer^1/"" )^0)
patterns.stripper = stripper
patterns.fullstripper = fullstripper
patterns.collapser = collapser
patterns.nospacer = nospacer
patterns.b_collapser = b_collapser
patterns.m_collapser = m_collapser
patterns.e_collapser = e_collapser
patterns.x_collapser = x_collapser
patterns.b_stripper = b_stripper
patterns.m_stripper = m_stripper
patterns.e_stripper = e_stripper
patterns.x_stripper = x_stripper
-- Letters, blanks and punctuation.
patterns.lowercase = lowercase
patterns.uppercase = uppercase
patterns.letter = patterns.lowercase + patterns.uppercase
patterns.space = space
patterns.tab = P("\t")
patterns.spaceortab = patterns.space + patterns.tab
patterns.newline = newline
patterns.emptyline = newline^1 -- one or more consecutive line endings
patterns.equal = P("=")
patterns.comma = comma
patterns.commaspacer = comma * spacer^0 -- a comma plus the spacers that follow it
patterns.period = period
patterns.colon = P(":")
patterns.semicolon = P(";")
patterns.underscore = underscore
patterns.escaped = escaped
patterns.squote = squote
patterns.dquote = dquote
-- The content of a quoted string: anything up to the first quote that is not
-- preceded by a backslash.
patterns.nosquote = (escaped + (1-squote))^0
patterns.nodquote = (escaped + (1-dquote))^0
-- Quoted strings in which the quotes are replaced by nothing; this only has an
-- effect inside a substitution capture.
patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"") -- will change to C in the middle
patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") -- will change to C in the middle
patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble
-- Runs of spacers replaced by nothing.
patterns.unspacer = ((patterns.spacer^1)/"")^0
-- Quoted strings including their quotes.
patterns.singlequoted = squote * patterns.nosquote * squote
patterns.doublequoted = dquote * patterns.nodquote * dquote
patterns.quoted = patterns.doublequoted + patterns.singlequoted
-- Number related patterns. None of them is anchored at the end of the subject.
patterns.digit = digit
patterns.digits = digits
patterns.octdigit = octdigit
patterns.octdigits = octdigits
patterns.hexdigit = hexdigit
patterns.hexdigits = hexdigits
patterns.sign = sign
patterns.cardinal = digits
patterns.integer = sign^-1 * digits
-- Fractional numbers: the digits before the separator are optional, those
-- after it are mandatory. The 'c' variants use a comma as separator and the
-- 'cp' variants accept a comma as well as a period.
patterns.unsigned = digit^0 * period * digits
patterns.float = sign^-1 * patterns.unsigned
patterns.cunsigned = digit^0 * comma * digits
patterns.cpunsigned = digit^0 * (period + comma) * digits
patterns.cfloat = sign^-1 * patterns.cunsigned
patterns.cpfloat = sign^-1 * patterns.cpunsigned
-- The fractional form is tried before the integer one.
patterns.number = patterns.float + patterns.integer
patterns.cnumber = patterns.cfloat + patterns.integer
patterns.cpnumber = patterns.cpfloat + patterns.integer
patterns.oct = zero * octdigits -- hm is this ok
patterns.octal = patterns.oct
-- NOTE(review): HEX and hex accept every uppercase respectively lowercase
-- letter after the prefix, not only A-F / a-f -- confirm that this is intended.
patterns.HEX = zero * P("X") * (digit+uppercase)^1
patterns.hex = zero * P("x") * (digit+lowercase)^1
patterns.hexadecimal = zero * S("xX") * hexdigits
-- Hexadecimal float: optional sign, 0x or 0X, a mantissa with an optional
-- fractional part and an optional 'p' exponent.
patterns.hexafloat = sign^-1
* zero * S("xX")
* (hexdigit^0 * period * hexdigits + hexdigits * period * hexdigit^0 + hexdigits)
* (S("pP") * sign^-1 * hexdigits)^-1
-- Decimal float in exponent notation; the exponent part is mandatory here.
patterns.decafloat = sign^-1
* (digit^0 * period * digits + digits * period * digit^0 + digits)
* S("eE") * sign^-1 * digits
-- A name made of letters, underscores and (not as first character) digits that
-- spans the whole subject.
patterns.propername = (uppercase + lowercase + underscore) * (uppercase + lowercase + underscore + digit)^0 * endofstring
-- A run of characters that contains neither a line ending nor a space.
patterns.somecontent = (anything - newline - space)^1 -- (utf8char - newline - space)^1
-- Lookahead: succeeds without consuming input when the next character does not
-- start a line ending; fails at the end of the subject.
patterns.beginline = #(1-newline)
-- Drops leading whitespace, keeps quoted strings as they are and replaces the
-- other whitespace runs by one space, except for a run at the very end which
-- is removed.
patterns.longtostring = Cs(whitespace^0/"" * ((patterns.quoted + nonwhitespace^1 + whitespace^1/"" * (endofstring + Cc(" ")))^0))
-- local function anywhere(pattern) -- slightly adapted from website
-- return P { P(pattern) + 1 * V(1) }
-- end
-- Build a pattern that skips input up to the first place where 'pattern'
-- matches and then matches it there.
local function anywhere(pattern) -- faster
    local target = P(pattern)
    local skipped = (1-target)^0
    return skipped * target
end

lpeg.anywhere = anywhere
-- Return a function that tells (as a boolean) whether 'p' matches somewhere in
-- the string that it gets passed.
function lpeg.instringchecker(p)
    local finder = anywhere(p)
    return function(str)
        if lpegmatch(finder,str) then
            return true
        else
            return false
        end
    end
end
-- function lpeg.splitter(pattern, action)
-- return (((1-P(pattern))^1)/action+1)^0
-- end
-- function lpeg.tsplitter(pattern, action)
-- return Ct((((1-P(pattern))^1)/action+1)^0)
-- end
-- Build a pattern that walks over the subject and handles every run of text
-- that is free of 'pattern': the run is passed through 'action' when that is
-- given, otherwise it is captured with Cs. Everything else is skipped one
-- character at a time.
function lpeg.splitter(pattern, action)
    local chunk = (1-P(pattern))^1
    local piece
    if action then
        piece = chunk/action
    else
        piece = Cs(chunk)
    end
    return (piece+1)^0
end
-- Table variant of lpeg.splitter: the values of all runs that are free of
-- 'pattern' end up in one table.
function lpeg.tsplitter(pattern, action)
    local chunk = (1-P(pattern))^1
    local piece
    if action then
        piece = chunk/action
    else
        piece = Cs(chunk)
    end
    return Ct((piece+1)^0)
end
-- problem: a separator can be an lpeg pattern and that does not hash too well,
-- but it's quite okay as the key is then not garbage collected
-- Caches for single splitters (s), multi splitters (m) and table splitters (t),
-- indexed by the separator as it was passed by the caller.
local splitters_s, splitters_m, splitters_t = { }, { }, { }

-- Return a pattern that splits its subject at 'separator' (string or pattern).
--
-- With 'single' set only the first separator counts and at most two captures
-- result: the text before it and (when there is a separator) everything after
-- it. Otherwise there is one capture per part, empty parts included.
--
-- The splitter is remembered under the separator that was passed in and not
-- under the pattern made from it; a string separator would otherwise never be
-- found again and every call would add one more entry to the cache. The cache
-- that belongs to the requested mode is the only one consulted, so asking for
-- a single splitter can never deliver a cached multi splitter.
local function splitat(separator,single)
    local known = single and splitters_s or splitters_m
    local splitter = known[separator]
    if not splitter then
        local pattern = P(separator)
        local other = C((1 - pattern)^0)
        if single then
            splitter = other * (pattern * C(anything^0) + "") -- ?
        else
            splitter = other * (pattern * other)^0
        end
        known[separator] = splitter
    end
    return splitter
end
-- Return a (cached) pattern that collects the parts delivered by the multi
-- splitter for 'separator' in a table.
local function tsplitat(separator)
    local collector = splitters_t[separator]
    if collector then
        return collector
    end
    collector = Ct(splitat(separator))
    splitters_t[separator] = collector
    return collector
end

lpeg.splitat = splitat
lpeg.tsplitat = tsplitat
-- Split 'str' at 'separator' (a comma when omitted) and return the parts as
-- multiple values.
function string.splitup(str,separator)
    separator = separator or ","
    local splitter = splitters_m[separator] or splitat(separator)
    return lpegmatch(splitter,str)
end
-- local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more
-- local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more
-- local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps
-- local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps
-- Table splitters by separator, shared by lpeg.split and string.split.
local cache = { }

-- Split 'str' at 'separator' and return the parts in a table.
function lpeg.split(separator,str)
    local splitter = cache[separator] or tsplitat(separator)
    cache[separator] = splitter
    return lpegmatch(splitter,str)
end
-- Split 'str' at 'separator' and return the parts in a table; without a
-- separator the table holds just 'str'.
function string.split(str,separator)
    if not separator then
        return { str }
    end
    local splitter = cache[separator]
    if not splitter then
        splitter = tsplitat(separator)
        cache[separator] = splitter
    end
    return lpegmatch(splitter,str)
end
-- Optional spacers followed by a line ending.
local spacing = patterns.spacer^0 * newline -- sort of strip
-- A line that holds spacers only: produces an empty string.
local empty = spacing * Cc("")
-- Text up to the next 'spacing', which is consumed too when present.
local nonempty = Cs((1-spacing)^1) * spacing^-1
-- One or more lines, each giving one capture.
local content = (empty + nonempty)^1
patterns.textline = content
-- Splits the subject at line endings and collects the lines in a table.
local linesplitter = tsplitat(newline)
patterns.linesplitter = linesplitter
-- Return a table with the lines of 'str'; "\r\n", "\r" and "\n" all count as
-- line ending.
function string.splitlines(str)
    local lines = lpegmatch(linesplitter,str)
    return lines
end
-- lpeg.splitters = cache -- no longer public
-- Checked splitters, indexed by the separator as passed by the caller.
local cache = { }

-- Return the (cached) checked splitter for 'separator' (string or pattern).
-- It skips separators at the start, treats a run of separators as one and
-- collects the non empty parts in a table; there is no match (nil) when the
-- subject has no such part.
--
-- The splitter is stored under the separator that was passed in. Storing it
-- under the pattern made from that separator meant that a string separator
-- never hit the cache and that every call added an entry to it.
local function checkedsplitter(separator)
    local splitter = cache[separator]
    if not splitter then
        local pattern = P(separator)
        local other = C((1 - pattern)^1)
        splitter = Ct(pattern^0 * other * (pattern^1 * other)^0)
        cache[separator] = splitter
    end
    return splitter
end

-- Split 'str' at runs of 'separator'; returns a table with the non empty parts
-- or nil when there are none.
function lpeg.checkedsplit(separator,str)
    return lpegmatch(checkedsplitter(separator),str)
end

-- Same as lpeg.checkedsplit but with the string as first argument.
function string.checkedsplit(str,separator)
    return lpegmatch(checkedsplitter(separator),str)
end
-- from roberto's site:
-- Convert the first two, three or four bytes of 's' into a number by removing
-- the marker bits (192, 224 or 240 from the first byte, 128 from each of the
-- others) and combining what remains as base 64 digits.
local function f2(s)
    local b1, b2 = byte(s,1,2)
    return (b1 - 192) * 64 + (b2 - 128)
end

local function f3(s)
    local b1, b2, b3 = byte(s,1,3)
    return ((b1 - 224) * 64 + (b2 - 128)) * 64 + (b3 - 128)
end

local function f4(s)
    local b1, b2, b3, b4 = byte(s,1,4)
    return (((b1 - 240) * 64 + (b2 - 128)) * 64 + (b3 - 128)) * 64 + (b4 - 128)
end
-- Match one utf8 character and produce a number for it: a single byte goes
-- through string.byte, longer sequences go through f2, f3 or f4.
local utf8byte = patterns.utf8one/byte + patterns.utf8two/f2 + patterns.utf8three/f3 + patterns.utf8four/f4
patterns.utf8byte = utf8byte
--~ local str = " a b c d "
--~ local s = lpeg.stripper(lpeg.R("az")) print("["..lpegmatch(s,str).."]")
--~ local s = lpeg.keeper(lpeg.R("az")) print("["..lpegmatch(s,str).."]")
--~ local s = lpeg.stripper("ab") print("["..lpegmatch(s,str).."]")
--~ local s = lpeg.keeper("ab") print("["..lpegmatch(s,str).."]")
-- Strippers that were made from a string, indexed by that string.
local stripcache = { }

-- Return a substitution pattern that removes characters from its subject.
-- A string argument is taken as a set of characters (and the result is
-- cached); any other argument is used as the pattern that has to go.
function lpeg.stripper(str)
    if type(str) ~= "string" then
        return Cs(((str^1)/"" + 1)^0)
    end
    local pattern = stripcache[str]
    if not pattern then
        pattern = Cs(((S(str)^1)/"" + 1)^0)
        stripcache[str] = pattern
    end
    return pattern
end
-- Keepers that were made from a string, indexed by that string.
local keepcache = { }

-- Return a substitution pattern that removes everything except what is given.
-- A string argument is taken as a set of characters (and the result is
-- cached); any other argument is used as the pattern that has to stay.
function lpeg.keeper(str)
    if type(str) ~= "string" then
        return Cs((((1-str)^1)/"" + 1)^0)
    end
    local pattern = keepcache[str]
    if not pattern then
        pattern = Cs((((1-S(str))^1)/"" + 1)^0)
        keepcache[str] = pattern
    end
    return pattern
end
-- Return a pattern that produces its subject without the leading 'str' (a
-- string or pattern); the subject is produced unchanged when it does not
-- start with 'str'.
function lpeg.frontstripper(str) -- or pattern (yet undocumented)
    local prefix = P(str) + P(true)
    return prefix * Cs(anything^0)
end
-- Return a pattern that produces its subject up to a 'str' (a string or
-- pattern) that sits at the very end, or the whole subject when there is no
-- such ending.
function lpeg.endstripper(str) -- or pattern (yet undocumented)
    local ending = P(str) * endofstring
    return Cs((1 - ending)^0)
end
-- Just for fun I looked at the used bytecode and
-- p = (p and p + pp) or pp gets one more (testset).
-- todo: cache when string
-- Build a substitution pattern (or, with 'makefunction', a function applying
-- it to a string).
--
--   one          : what to replace; either a string or pattern (replaced by
--                  'two' or by nothing), a hash { old = new, ... } or a list
--                  of pairs { { old, new }, ... }
--   two          : replacement, only used when 'one' is not a table
--   makefunction : return a function instead of a pattern
--   isutf        : step over unmatched input per utf8 character instead of
--                  per byte
--
-- The entries of a hash are combined in traversal order, which is undefined.
function lpeg.replacer(one,two,makefunction,isutf) -- in principle we should sort the keys
    local step = isutf and utf8char or 1
    local pattern
    if type(one) ~= "table" then
        pattern = Cs((P(one)/(two or "") + step)^0)
    else
        local count = #one
        if count == 1 then
            local pair = one[1]
            pattern = Cs((P(pair[1])/pair[2] + step)^0)
        else
            local choice = P(false)
            if count == 0 then
                -- a hash: old strings are the keys
                for old, new in next, one do
                    choice = choice + P(old) / new
                end
            else
                -- a list of { old, new } pairs
                for i=1,count do
                    local pair = one[i]
                    choice = choice + P(pair[1]) / pair[2]
                end
            end
            pattern = Cs((choice + step)^0)
        end
    end
    if not makefunction then
        return pattern
    end
    return function(str)
        return lpegmatch(pattern,str)
    end
end
-- local pattern1 = P(1-P(pattern))^0 * P(pattern) : test for not nil
-- local pattern2 = (P(pattern) * Cc(true) + P(1))^0 : test for true (could be faster, but not much)
-- Build a pattern (or, with 'makefunction', a function) that looks for the
-- first occurrence of what 'lst' describes: a string or pattern, a list of
-- those, or a hash of which only the keys are used. With 'isutf' the skipping
-- goes per utf8 character instead of per byte.
function lpeg.finder(lst,makefunction,isutf) -- beware: slower than find with 'patternless finds'
    local target
    if type(lst) ~= "table" then
        target = P(lst)
    else
        target = P(false)
        local count = #lst
        if count == 0 then
            for key in next, lst do
                target = target + P(key) -- ignore key, so we can use a replacer table
            end
        else
            for i=1,count do
                target = target + P(lst[i])
            end
        end
    end
    local step = 1
    if isutf then
        step = utf8char or 1
    end
    local pattern = (step-target)^0 * target
    if not makefunction then
        return pattern
    end
    return function(str)
        return lpegmatch(pattern,str)
    end
end
-- print(lpeg.match(lpeg.replacer("e","a"),"test test"))
-- print(lpeg.match(lpeg.replacer{{"e","a"}},"test test"))
-- print(lpeg.match(lpeg.replacer({ e = "a", t = "x" }),"test test"))
-- Caches for lpeg.firstofsplit (f) and lpeg.secondofsplit (s).
local splitters_f, splitters_s = { }, { }

-- Return a (cached) pattern that captures the text in front of the first
-- 'separator', or the whole subject when there is no separator.
function lpeg.firstofsplit(separator) -- always return value
    local splitter = splitters_f[separator]
    if splitter then
        return splitter
    end
    splitter = C((1 - P(separator))^0)
    splitters_f[separator] = splitter
    return splitter
end
-- Return a (cached) pattern that captures the text after the first
-- 'separator'; it does not match when the subject has no separator.
function lpeg.secondofsplit(separator) -- nil if not split
    local splitter = splitters_s[separator]
    if splitter then
        return splitter
    end
    local sep = P(separator)
    splitter = (1 - sep)^0 * sep * C(anything^0)
    splitters_s[separator] = splitter
    return splitter
end
-- Caches for lpeg.beforesuffix (s) and lpeg.afterprefix (p).
local splitters_s, splitters_p = { }, { }

-- Return a (cached) pattern that captures the text in front of a 'separator'
-- that has to be the first one in the subject and has to end it.
function lpeg.beforesuffix(separator) -- nil if nothing but empty is ok
    local splitter = splitters_s[separator]
    if splitter then
        return splitter
    end
    local suffix = P(separator)
    splitter = C((1 - suffix)^0) * suffix * endofstring
    splitters_s[separator] = splitter
    return splitter
end
-- Return a (cached) pattern that captures everything after a 'separator' that
-- starts the subject.
function lpeg.afterprefix(separator) -- nil if nothing but empty is ok
    local splitter = splitters_p[separator]
    if splitter then
        return splitter
    end
    splitter = P(separator) * C(anything^0)
    splitters_p[separator] = splitter
    return splitter
end
-- Return a grammar that matches a 'left' ... 'right' pair including properly
-- nested pairs in between.
function lpeg.balancer(left,right)
    local open, close = P(left), P(right)
    local inner = (1 - open - close) + V(1)
    return P { open * inner^0 * close }
end
-- print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de"))
-- print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty
-- print(3,lpegmatch(lpeg.firstofsplit(":"),"bc"))
-- print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de"))
-- print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty
-- print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc"))
-- print(7,lpegmatch(lpeg.secondofsplit(":"),"bc"))
-- print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc"))
-- this was slower but lpeg has been sped up in the meantime, so we no longer
-- use this (still seems somewhat faster on long strings)
--
-- local nany = utf8char/""
--
-- function lpeg.counter(pattern)
-- pattern = Cs((P(pattern)/" " + nany)^0)
-- return function(str)
-- return #lpegmatch(pattern,str)
-- end
-- end
-- Return a function that counts the matches of 'pattern' in a string. The
-- count is returned, or handed to 'action' (nothing is returned then).
function lpeg.counter(pattern,action)
    local count = 0
    local function bump()
        count = count + 1
    end
    local scanner = (P(pattern) / bump + anything)^0
    ----- scanner = (P(pattern) * (P(true) / function() n = n + 1 end) + anything)^0
    ----- scanner = (P(pattern) * P(function() n = n + 1 end) + anything)^0
    local function tally(str)
        count = 0 -- the counter is shared by all calls, so start afresh
        lpegmatch(scanner,str)
        return count
    end
    if action then
        return function(str)
            action(tally(str))
        end
    else
        return tally
    end
end
-- lpeg.print(lpeg.R("ab","cd","gh"))
-- lpeg.print(lpeg.P("a","b","c"))
-- lpeg.print(lpeg.S("a","b","c"))
-- print(lpeg.count("äáàa",lpeg.P("á") + lpeg.P("à")))
-- print(lpeg.count("äáàa",lpeg.UP("áà")))
-- print(lpeg.count("äáàa",lpeg.US("àá")))
-- print(lpeg.count("äáàa",lpeg.UR("aá")))
-- print(lpeg.count("äáàa",lpeg.UR("àá")))
-- print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF)))
-- Tell whether 'p' is an lpeg pattern. A nil or false argument is returned as
-- it is.
function lpeg.is_lpeg(p)
    if not p then
        return p
    end
    return lpegtype(p) == "pattern"
end
-- Return an ordered choice of the given alternatives, which come either as a
-- table or as separate arguments. The order is kept as given, so longer
-- alternatives have to precede their prefixes.
function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") -- assume proper order
    local words = list
    if type(words) ~= "table" then
        words = { list, ... }
    end
    -- table.sort(words) -- longest match first
    local choice = P(words[1])
    for i=2,#words do
        choice = choice + P(words[i])
    end
    return choice
end
-- For the moment here, but it might move to utilities. Beware, we need to
-- have the longest keyword first, so 'aaa' comes before 'aa' which is why we
-- loop back from the end cq. prepend.
local sort = table.sort
-- Return a new table holding the entries 1 .. #old of 'old', so that the copy
-- can be sorted without touching the original.
local function copyindexed(old)
    local new = { }
    for i=1,#old do
        new[i] = old[i] -- the entry itself, not the table that holds it
    end
    return new
end
-- Return the keys of 'tab' as a sorted list.
local function sortedkeys(tab)
    local keys = { }
    for key in next, tab do
        keys[#keys+1] = key
    end
    sort(keys)
    return keys
end
-- Build an ordered choice from 'list' and put it in front of the optional
-- pattern 'pp'. The keywords are sorted and each one is put in front of what
-- was built so far, so that a longer keyword always precedes its own prefixes
-- ('aaa' is tried before 'aa').
--
--   list    : either a list of keywords (no replacements) or a hash that maps
--             keywords onto replacements
--   pp      : pattern that is tried after all keywords (optional)
--   delayed : hash only, the replacement is looked up in 'list' at match time
--   checked : hash only, a keyword that maps onto itself gets no replacement
--
-- Returns the combined pattern; this is 'pp' itself (possibly nil) when 'list'
-- is empty.
function lpeg.append(list,pp,delayed,checked)
    local p = pp
    if #list > 0 then
        local keys = copyindexed(list)
        sort(keys)
        -- walk upwards: as each keyword is prepended, the last (and thereby
        -- longest of a shared prefix family) keyword ends up in front
        for i=1,#keys do
            local k = keys[i]
            if p then
                p = P(k) + p
            else
                p = P(k)
            end
        end
    elseif delayed then -- hm, it looks like the lpeg parser resolves anyway
        local keys = sortedkeys(list)
        if p then
            for i=1,#keys do
                p = P(keys[i])/list + p
            end
        else
            -- nothing follows, so one table lookup for the whole choice will do
            for i=1,#keys do
                local k = keys[i]
                if p then
                    p = P(k) + p
                else
                    p = P(k)
                end
            end
            if p then
                p = p / list
            end
        end
    elseif checked then
        -- problem: substitution gives a capture
        local keys = sortedkeys(list)
        for i=1,#keys do
            local k = keys[i]
            local v = list[k]
            local entry = P(k)
            if k ~= v then
                entry = entry/v
            end
            if p then
                p = entry + p
            else
                p = entry
            end
        end
    else
        local keys = sortedkeys(list)
        for i=1,#keys do
            local k = keys[i]
            local entry = P(k)/list[k]
            if p then
                p = entry + p
            else
                p = entry
            end
        end
    end
    return p
end
-- inspect(lpeg.append({ a = "1", aa = "1", aaa = "1" } ,nil,true))
-- inspect(lpeg.append({ ["degree celsius"] = "1", celsius = "1", degree = "1" } ,nil,true))
-- function lpeg.exact_match(words,case_insensitive)
-- local pattern = concat(words)
-- if case_insensitive then
-- local pattern = S(upper(characters)) + S(lower(characters))
-- local list = { }
-- for i=1,#words do
-- list[lower(words[i])] = true
-- end
-- return Cmt(pattern^1, function(_,i,s)
-- return list[lower(s)] and i
-- end)
-- else
-- local pattern = S(concat(words))
-- local list = { }
-- for i=1,#words do
-- list[words[i]] = true
-- end
-- return Cmt(pattern^1, function(_,i,s)
-- return list[s] and i
-- end)
-- end
-- end
-- experiment:
-- Shared constant patterns: p_false never matches (neutral element when an
-- ordered choice is built up), p_true always matches without consuming input.
local p_false = P(false)
local p_true = P(true)
-- local function collapse(t,x)
-- if type(t) ~= "table" then
-- return t, x
-- else
-- local n = next(t)
-- if n == nil then
-- return t, x
-- elseif next(t,n) == nil then
-- -- one entry
-- local k = n
-- local v = t[k]
-- if type(v) == "table" then
-- return collapse(v,x..k)
-- else
-- return v, x .. k
-- end
-- else
-- local tt = { }
-- for k, v in next, t do
-- local vv, kk = collapse(v,k)
-- tt[kk] = vv
-- end
-- return tt, x
-- end
-- end
-- end
-- Case converters used for the case insensitive variant of the pattern
-- builder below; the ones from a global 'utf' table are preferred over the
-- string library ones.
local lower = utf and utf.lower or string.lower
local upper = utf and utf.upper or string.upper

-- Install other case converters; an omitted argument leaves the corresponding
-- converter as it is.
function lpeg.setutfcasers(l,u)
    if l then
        lower = l
    end
    if u then
        upper = u
    end
end
-- Turn one level of a character tree into an ordered choice (case sensitive).
-- A key of 't' is a character and its value is either true (a word ends
-- here) or a table with the next level; the "" key is not a character and is
-- skipped. With 'rest' set the choice may also match nothing, which is how a
-- word that is a prefix of a longer word gets accepted.
local function make1(t,rest)
    local choice = p_false
    local keys = sortedkeys(t)
    for i=1,#keys do
        local k = keys[i]
        local v = t[k]
        if k == "" or v == false then
            -- the end marker, or (can't happen) an unfinished branch
        elseif v == true then
            choice = choice + P(k) * p_true
        else
            choice = choice + P(k) * make1(v,v[""])
        end
    end
    if rest then
        choice = choice + p_true
    end
    return choice
end
-- Same as make1 but every character is accepted in its lowercased as well as
-- in its uppercased form, as delivered by the current case converters.
local function make2(t,rest) -- only ascii
    local choice = p_false
    local keys = sortedkeys(t)
    for i=1,#keys do
        local k = keys[i]
        if k ~= "" then
            local v = t[k]
            if v ~= false then -- false can't happen
                local either = P(lower(k)) + P(upper(k))
                if v == true then
                    choice = choice + either * p_true
                else
                    choice = choice + either * make2(v,v[""])
                end
            end
        end
    end
    if rest then
        choice = choice + p_true
    end
    return choice
end
-- Add the word 's' to the character tree 'tree'. The tree is walked (and
-- extended) one character, as delivered by gmatch(s,"."), at a time.
--
-- A node is one of:
--   false : a character that was just added and has no follow-up yet
--   true  : a word ends here and nothing longer is known
--   table : the possible next characters; its "" entry is set when a word
--           also ends at this point
--
-- While walking, 'p' is the table that holds the current node and 'pk' the
-- key under which it is stored, so that a false or true node can be replaced
-- by a table (or by true) when needed.
local function addtotree(tree,s)
    local t = tree
    local p, pk
    for c in gmatch(s,".") do
        if t == true then
            -- a shorter word ended here: keep that fact in the "" entry
            t = { [c] = true, [""] = true }
            p[pk] = t
            p = t
            t = false
        elseif t == false then
            t = { [c] = false }
            p[pk] = t
            p = t
            t = false
        else
            local tc = t[c]
            if not tc then
                tc = false
                t[c] = false
            end
            p = t
            t = tc
        end
        pk = c
    end
    if t == false then
        p[pk] = true
    elseif t == true then
        -- okay
    else
        -- a longer word already passes through here
        t[""] = true
    end
end

-- Return a pattern that matches any of the words in 'list', which is either a
-- list of strings or a hash of which the keys are used. The words are merged
-- into a character tree so that common prefixes are tested only once. With
-- 'insensitive' set, each character is accepted in lower- and uppercase form.
local function utfchartabletopattern(list,insensitive) -- goes to util-lpg
    local tree = { }
    local n = #list
    if n == 0 then
        for s in next, list do
            addtotree(tree,s)
        end
    else
        for i=1,n do
            addtotree(tree,list[i])
        end
    end
    -- collapse(tree,"") -- needs testing, maybe optional, slightly faster because P("x")*P("X") seems slower than P"(xX") (why)
    -- inspect(tree)
    return (insensitive and make2 or make1)(tree)
end

lpeg.utfchartabletopattern = utfchartabletopattern
-- Return a function that replaces each keyword of 'list' found in a string by
-- the value that 'list' holds for it. The input itself is returned when the
-- match gives nothing.
function lpeg.utfreplacer(list,insensitive)
    local keywords = utfchartabletopattern(list,insensitive)
    local pattern = Cs((keywords/list + utf8character)^0)
    local function replace(str)
        return lpegmatch(pattern,str) or str
    end
    return replace
end
-- local t = { "start", "stoep", "staart", "paard" }
-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/string.upper + 1)^1)
-- local t = { "a", "abc", "ac", "abe", "abxyz", "xy", "bef","aa" }
-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/string.upper + 1)^1)
-- inspect(lpegmatch(p,"a")=="A")
-- inspect(lpegmatch(p,"aa")=="AA")
-- inspect(lpegmatch(p,"aaaa")=="AAAA")
-- inspect(lpegmatch(p,"ac")=="AC")
-- inspect(lpegmatch(p,"bc")=="bc")
-- inspect(lpegmatch(p,"zzbczz")=="zzbczz")
-- inspect(lpegmatch(p,"zzabezz")=="zzABEzz")
-- inspect(lpegmatch(p,"ab")=="Ab")
-- inspect(lpegmatch(p,"abc")=="ABC")
-- inspect(lpegmatch(p,"abe")=="ABE")
-- inspect(lpegmatch(p,"xa")=="xA")
-- inspect(lpegmatch(p,"bx")=="bx")
-- inspect(lpegmatch(p,"bax")=="bAx")
-- inspect(lpegmatch(p,"abxyz")=="ABXYZ")
-- inspect(lpegmatch(p,"foobarbefcrap")=="foobArBEFcrAp")
-- local t = { ["^"] = 1, ["^^"] = 2, ["^^^"] = 3, ["^^^^"] = 4 }
-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/t + 1)^1)
-- inspect(lpegmatch(p," ^ ^^ ^^^ ^^^^ ^^^^^ ^^^^^^ ^^^^^^^ "))
-- local t = { ["^^"] = 2, ["^^^"] = 3, ["^^^^"] = 4 }
-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/t + 1)^1)
-- inspect(lpegmatch(p," ^ ^^ ^^^ ^^^^ ^^^^^ ^^^^^^ ^^^^^^^ "))
-- lpeg.utfchartabletopattern {
-- utfchar(0x00A0), -- nbsp
-- utfchar(0x2000), -- enquad
-- utfchar(0x2001), -- emquad
-- utfchar(0x2002), -- enspace
-- utfchar(0x2003), -- emspace
-- utfchar(0x2004), -- threeperemspace
-- utfchar(0x2005), -- fourperemspace
-- utfchar(0x2006), -- sixperemspace
-- utfchar(0x2007), -- figurespace
-- utfchar(0x2008), -- punctuationspace
-- utfchar(0x2009), -- breakablethinspace
-- utfchar(0x200A), -- hairspace
-- utfchar(0x200B), -- zerowidthspace
-- utfchar(0x202F), -- narrownobreakspace
-- utfchar(0x205F), -- math thinspace
-- }
-- a few handy ones:
--
-- faster than find(str,"[\n\r]") when match and # > 7 and always faster when # > 3
-- Matches when the subject has a "\n" or "\r" somewhere; the match runs up to
-- and including the first one.
patterns.containseol = lpeg.finder(eol) -- (1-eol)^0 * eol
-- The next pattern^n variant is based on an approach suggested
-- by Roberto: constructing a big repetition in chunks.
--
-- Being sparse is not needed, and only complicate matters and
-- the number of redundant entries is not that large.
-- Helper for lpeg.times: extend the grammar in 'result' so that its 'start'
-- rule matches rule "1" exactly 'n' more times.
--
-- 'step' is a power of two. Rule "<step>" stands for 'step' repetitions and is
-- defined as two times rule "<step/2>"; the start rule is the product of the
-- rules for the powers of two that add up to n. Each call handles one power
-- and passes the remainder on to the next lower one.
--
-- Rule names are made from floored values. A bare tostring(step) gives
-- "65536.0" on Lua versions that keep integers and floats apart, and a
-- reference to rule "1.0" never finds the rule that is stored as "1".
--
-- The recursion ends at step 1. Continuing with fractional steps adds nothing
-- for an integer count and only stops when the step underflows to zero.
local function nextstep(n,step,result)
    local m = n % step -- mod(n,step)
    local d = floor(n/step) -- div(n,step)
    if d > 0 then
        local v = V(tostring(floor(step)))
        local s = result.start
        for i=1,d do
            if s then
                s = v * s
            else
                s = v
            end
        end
        result.start = s
    end
    -- only define the rules that are reachable from the start rule
    if step > 1 and result.start then
        local v = V(tostring(floor(step/2)))
        result[tostring(floor(step))] = v * v
    end
    if step > 1 then
        return nextstep(m,step/2,result)
    else
        return result
    end
end
-- Return a pattern that matches 'pattern' exactly 'n' times, built as a
-- grammar of doubling rules so that large counts stay cheap.
--
-- A count below one gives a pattern that matches the empty string; building
-- the grammar in that case would leave it without a start rule.
function lpeg.times(pattern,n)
    if n < 1 then
        return P(true)
    end
    return P(nextstep(n,2^16,{ "start", ["1"] = pattern }))
end
-- local p = lpeg.Cs((1 - lpeg.times(lpeg.P("AB"),25))^1)
-- local s = "12" .. string.rep("AB",20) .. "34" .. string.rep("AB",30) .. "56"
-- inspect(p)
-- print(lpeg.match(p,s))
-- moved here (before util-str)
do

    -- stripzeros: handles all numbers in a string. After a run of digits a
    -- fraction of zeros only is dropped together with its period, otherwise
    -- only the zeros that end the fraction are dropped.

    local zerotail = zero^0 * -digit -- suggested by Roberto
    local dropfraction = (period * zerotail) / ""
    local trimfraction = period * (digit - zerotail)^1 * (zerotail / "")

    lpeg.patterns.stripzeros = Cs((digits * (dropfraction + trimfraction) + 1)^0) -- multiple in string

    -- stripzero: the same idea for a subject that is one float, with the zeros
    -- to be dropped sitting at the very end of the subject.

    local nonzero = digit - zero
    local zerosatend = zero^1 * endofstring

    lpeg.patterns.stripzero = Cs((1-period)^0 * (
        (period * zerosatend) / ""
      + period * (nonzero^1 + (zerosatend/"") + zero^1)^0
      + endofstring
    )) -- slightly more efficient but expects a float !

    -- local stripper = lpeg.patterns.stripzero
    -- local sample = "bla 11.00 bla 11 bla 0.1100 bla 1.00100 bla 0.00 bla 0.001 bla 1.1100 bla 0.100100100 bla 0.00100100100"
    -- collectgarbage("collect")
    -- str = string.rep(sample,10000)
    -- local ts = os.clock()
    -- lpegmatch(stripper,str)
    -- print(#str, os.clock()-ts, lpegmatch(stripper,sample))

end
-- for practical reasons we keep this here:
-- Lookup tables between single characters (bytes) and their two digit
-- hexadecimal (upper- and lowercase) and three digit decimal notation.
local byte_to_HEX = { }
local byte_to_hex = { }
local byte_to_dec = { } -- for md5
local hex_to_byte = { }

for code=0,255 do
    local character = char(code)
    local upperhex = format("%02X",code)
    local lowerhex = format("%02x",code)
    byte_to_HEX[character] = upperhex
    byte_to_hex[character] = lowerhex
    byte_to_dec[character] = format("%03i",code)
    -- both notations map back, mixed case ones do not
    hex_to_byte[lowerhex] = character
    hex_to_byte[upperhex] = character
end
-- Single step conversions: each consumes two characters (hextobyte) or one
-- character (the others) and looks the match up in the corresponding table.
local hextobyte = P(2)/hex_to_byte
local bytetoHEX = P(1)/byte_to_HEX
local bytetohex = P(1)/byte_to_hex
local bytetodec = P(1)/byte_to_dec
-- Whole string conversions: substitution captures that repeat the single step.
local hextobytes = Cs(hextobyte^0)
local bytestoHEX = Cs(bytetoHEX^0)
local bytestohex = Cs(bytetohex^0)
local bytestodec = Cs(bytetodec^0)
patterns.hextobyte = hextobyte
patterns.bytetoHEX = bytetoHEX
patterns.bytetohex = bytetohex
patterns.bytetodec = bytetodec
patterns.hextobytes = hextobytes
patterns.bytestoHEX = bytestoHEX
patterns.bytestohex = bytestohex
patterns.bytestodec = bytestodec
-- Convert each byte of 's' into two uppercase hexadecimal digits; nil, false
-- and the empty string are returned as they are.
function string.toHEX(s)
    if s and s ~= "" then
        return lpegmatch(bytestoHEX,s)
    end
    return s
end
-- Convert each byte of 's' into two lowercase hexadecimal digits; nil, false
-- and the empty string are returned as they are.
function string.tohex(s)
    if s and s ~= "" then
        return lpegmatch(bytestohex,s)
    end
    return s
end
-- Convert each byte of 's' into three decimal digits; nil, false and the
-- empty string are returned as they are.
function string.todec(s)
    if s and s ~= "" then
        return lpegmatch(bytestodec,s)
    end
    return s
end
-- Convert a string of hexadecimal digit pairs into the bytes they stand for;
-- nil, false and the empty string are returned as they are.
function string.tobytes(s)
    if s and s ~= "" then
        return lpegmatch(hextobytes,s)
    end
    return s
end
-- local h = "ADFE0345"
-- local b = lpegmatch(patterns.hextobytes,h)
-- print(h,b,string.tohex(b),string.toHEX(b))
-- Checkers made by containsws, indexed by the word they look for. From here on
-- this table hides the shared pattern registry of the same name.
local patterns = { } -- can be made weak

-- Return a (cached) pattern that produces true when 'what' shows up as a
-- whitespace delimited word in the subject and false otherwise. The word
-- counts when it sits at the start or right after whitespace and is followed
-- by whitespace or the end of the subject.
local function containsws(what)
    local checker = patterns[what]
    if checker then
        return checker
    end
    local word = P(what) * (whitespace + endofstring) * Cc(true)
    local later = whitespace * P(word)
    checker = P(word) + P(1-later)^0 * later + Cc(false)
    patterns[what] = checker
    return checker
end

lpeg.containsws = containsws
-- Tell whether 'what' is one of the whitespace separated words in 'str'.
function string.containsws(str,what)
    local checker = patterns[what] or containsws(what)
    return lpegmatch(checker,str)
end