implement custom directive parser

This commit is contained in:
Steven G. Johnson 2017-08-03 09:40:14 -04:00
parent 9c8c04e950
commit ab60bb59d8
2 changed files with 205 additions and 72 deletions

View File

@ -2,46 +2,180 @@
#
# The basic idea is that we search for `\foo{arg}`, `{\foo arg}`,
# or `{\foo{arg}}`, and look up `foo` in a dictionary of substitutions
# like `textit` -> `*#1*` where #1 is where the (first) argument is
# like `\textit` -> `*#1*` where #1 is where the (first) argument is
# substituted. Then we have separate dictionary entries for text/plain,
# text/markdown, etcetera.
# regex matching (directive,arg)
#
# A directive is either a single non-word character (`\W`) or a run of ASCII
# letters. The pattern has three alternatives, with these capture groups:
#   1. `\foo{arg}`                  -> group 1 = directive, group 2 = arg
#   2. `{\foo arg}` or `{\foo{arg}}` -> group 3 = directive, group 4 = arg as
#      written (braces included if present), group 5 = arg without its braces
#      (only set when the arg was braced)
#   3. `\foo arg`                   -> group 6 = directive, group 7 = arg, a
#      (possibly empty) run of `\w` characters
const latex_directive = r"\\(\W|[A-Za-z]+) *\{([^}]*)\}|\{ *\\(\W|[A-Za-z]+) *(\{([^}]*)\}|[^}]*)\}|\\(\W|[A-Za-z]+) *(\w*)"
###########################################################################
# parsing LaTeX directives:
# given a match m to latex_directive, return (directive,arg)
#
# The regex has three alternatives; whichever one matched determines which
# capture groups are set:
#   * groups 1,2   : `\foo{arg}`
#   * groups 3,4,5 : `{\foo arg}` (arg in group 4) or `{\foo{arg}}` (arg in group 5)
#   * groups 6,7   : `\foo arg`
# Throws an ArgumentError if none of the directive groups is set.
function extract_directive(m::RegexMatch)
    c = m.captures
    c[1] !== nothing && return (c[1], c[2])
    # prefer the brace-free argument (group 5) when the argument was braced
    c[3] !== nothing && return (c[3], c[5] === nothing ? c[4] : c[5])
    c[6] !== nothing && return (c[6], c[7])
    throw(ArgumentError("unknown latex_directive match"))
end
# Byte values of the ASCII characters that the directive parser compares
# against when scanning a string byte by byte.
const BACKSLASH = UInt8('\\')
const BRACEOPEN = UInt8('{')
const BRACECLOSE = UInt8('}')
const SPACE = UInt8(' ')
const DOLLAR = UInt8('$')

# true if the byte `x` is an ASCII letter (a-z or A-Z)
isalpha8(x::UInt8) = UInt8('a') <= x <= UInt8('z') || UInt8('A') <= x <= UInt8('Z')

# true if the byte `x` is an ASCII digit (0-9) or an ASCII letter
isalnum8(x::UInt8) = UInt8('0') <= x <= UInt8('9') || isalpha8(x)
"""
    search_latexdirective(string, istart=1)

Search for a LaTeX directive \\directive{arg} or similar in `string`, starting at
byte index `istart`, returning `(ds, de, ae)` such that `string[ds:de]` gives
`\\directive` and `string[de+1:ae]` gives `{arg}`. Use [`striparg`](@ref) to remove
surrounding braces and whitespace from the `arg`.

A directive name is either a run of ASCII letters or a single non-letter byte
following the backslash. A bare brace group `{...}` is reported as an *empty*
directive (`de == ds - 1`) whose argument is the whole group.

Returns `(0, 0, 0)` if `istart` is out of range or no directive is found.
All indices are byte offsets into `string`.
"""
function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1)
    e = sizeof(s)
    0 < istart <= e || return 0,0,0
    p = pointer(s)
    i = istart
    allspaces = true
    # find \foo directive or {...}:
    while i <= e
        c = unsafe_load(p, i)
        (c == BACKSLASH || c == BRACEOPEN) && break
        c != SPACE && (allspaces = false)
        i += 1
    end
    if i <= e && unsafe_load(p, i) == BACKSLASH
        directive_start = i
        i += 2 # step past the backslash and the first byte of the directive
        i-1 > e && return 0,0,0 # backslash was the last byte: no directive
        if isalpha8(unsafe_load(p, i-1))
            # alphabetic directive: consume the remaining letters
            while i <= e && isalpha8(unsafe_load(p, i))
                i += 1
            end
        end
        directive_end = i-1
        # look for optional opening brace
        while i <= e && unsafe_load(p, i) == SPACE
            i += 1
        end
        i > e && return directive_start, directive_end, e
        inbrace = unsafe_load(p, i) == BRACEOPEN
        if !inbrace
            # search backwards from \foo to look for { \foo ...}
            j = directive_start - 1
            while j >= istart && unsafe_load(p, j) == SPACE
                j -= 1
            end
            if j < istart || unsafe_load(p, j) != BRACEOPEN
                # argument is not in braces … get next token
                if allspaces
                    # if `\directive ...` was preceded only
                    # by whitespace, then assume arguments
                    # extend to the end of the string. This
                    # happens when we recurse on `{\directive ...}`.
                    return directive_start, directive_end, e
                end
                while i <= e && isalnum8(unsafe_load(p, i))
                    i += 1
                end
                return directive_start, directive_end, i-1
            end
        end
        i += 1
    elseif i > e
        return 0,0,0
    else # { ... }
        directive_start = i
        directive_end = i - 1 # empty directive: the argument starts at the brace
        inbrace = true
        i += 1
    end
    # search for end of argument (closing brace)
    nbraces = 1
    while i <= e
        c = unsafe_load(p, i)
        if c == BRACEOPEN
            nbraces += 1
        elseif c == BRACECLOSE
            nbraces -= 1
            if nbraces == 0
                return directive_start, directive_end, inbrace ? i : i-1
            end
        end
        i += 1
    end
    # unbalanced braces: treat the rest of the string as the argument
    return directive_start, directive_end, e
end
"""
    striparg(s, argstart=start(s), argend=endof(s))

Return the substring of `s` corresponding to the argument from `argstart:argend`, stripping
leading/trailing whitespace and braces.

If the byte at `argend` is a closing brace, everything up to and including the
first opening brace in the range is dropped along with that closing brace.
An empty range (`argstart > argend`, e.g. an empty `s`) yields an empty substring.

Throws a `BoundsError` if a non-empty range falls outside `1:endof(s)`, and an
error if the argument ends in `}` but contains no `{`.
"""
function striparg(s::Union{String,SubString{String}}, argstart::Int=start(s), argend::Int=endof(s))
    # nothing to strip from an empty argument (e.g. the contents of `\emph{}`)
    argstart > argend && return SubString(s, argstart, argend)
    e = endof(s)
    (1 <= argstart <= e && 1 <= argend <= e) || throw(BoundsError())
    p = pointer(s)
    if unsafe_load(p, argend) == BRACECLOSE
        argend -= 1 # omit brace
        while argstart <= argend && unsafe_load(p, argstart) != BRACEOPEN
            argstart += 1
        end
        argstart > argend && error("malformed argument")
        argstart += 1 # omit brace
    end
    # trim trailing, then leading, spaces
    while argstart <= argend && unsafe_load(p, argend) == SPACE
        argend -= 1
    end
    while argstart <= argend && unsafe_load(p, argstart) == SPACE
        argstart += 1
    end
    return SubString(s, argstart, argend)
end
# `replace` can be driven by our hand-written directive scanner: all it
# needs is a marker type, LaTeXDirectiveSearch, together with a method
# search(s, ::LaTeXDirectiveSearch, i) that reports the index range
# covered by the next directive.
struct LaTeXDirectiveSearch end

# Return the range from the directive start to the argument end as reported by
# `search_latexdirective(s, i)`, or the empty range `0:-1` when the reported
# start lies before `i` (which includes the "not found" result of all zeros).
function Base.search(s::AbstractString, ::LaTeXDirectiveSearch, i::Integer)
    found = search_latexdirective(s, i)
    dstart, argend = found[1], found[3]
    if dstart < i
        return 0:-1
    end
    return dstart:argend
end
###########################################################################
# Unicode substitutions for LaTeX directives
const latex_unicode = Dict{String,String}(
# accent escapes like `\"u` for `ü`, from the list at
# https://en.wikibooks.org/wiki/LaTeX/Special_Characters
# converted to LaTeX characters (mostly combining marks)
"`" => "#1\u0300",
"'" => "#1\u0301",
"^" => "#1\u0302",
"\"" => "#1\u0308",
"H" => "#1\u030b",
"~" => "#1\u0303",
"c" => "#1\u0327",
"k" => "#1\u0328",
"l" => "\u0142#1",
"=" => "#1\u0304",
"b" => "#1\u0331",
"." => "#1\u0307",
"d" => "#1\u0323",
"r" => "#1\u030a",
"u" => "#1\u0306",
"v" => "#1\u030c",
"t" => "#1\u0361", # fixme: u+0361 should go after first char in #1
"o" => "\u00f8#1",
"i" => "\u0131#1",
"j" => "\u0237#1",
"\\`" => "#1\u0300",
"\\'" => "#1\u0301",
"\\^" => "#1\u0302",
"\\\"" => "#1\u0308",
"\\H" => "#1\u030b",
"\\~" => "#1\u0303",
"\\c" => "#1\u0327",
"\\k" => "#1\u0328",
"\\l" => "\u0142",
"\\=" => "#1\u0304",
"\\b" => "#1\u0331",
"\\." => "#1\u0307",
"\\d" => "#1\u0323",
"\\r" => "#1\u030a",
"\\u" => "#1\u0306",
"\\v" => "#1\u030c",
"\\t" => "#1\u0361", # fixme: u+0361 should go after first char in #1
"\\o" => "\u00f8",
"\\i" => "\u0131",
"\\j" => "\u0237",
# other backslash escapes
"\\\\" => "\\",
"\\{" => "{", "\\}" => "}",
"\\%" => "%",
# We parse {....} quoting as an empty directive:
"" => "#1",
# many other substitutions can be found in
# Base.REPLCompletions.latex_symbols
@ -49,60 +183,59 @@ const latex_unicode = Dict{String,String}(
# LaTeX directives converted to Markdown
#
# Maps a directive name to its Markdown replacement template; `#1` in a
# template marks where the directive's argument is substituted.
# Each directive is listed under two keys: the bare name (`emph`) and the
# name with its leading backslash (`\emph`).
const md_directives = Dict{String,String}(
"emph" => "_#1_",
"textit" => "_#1_",
"it" => "_#1_",
"textbf" => "**#1**",
"bf" => "**#1**",
"texttt" => "`#1`",
"url" => "[#1](#1)",
"sout" => "~~#1~~",
"st" => "~~#1~~",
"cancel" => "~~#1~~",
"\\emph" => "_#1_",
"\\textit" => "_#1_",
"\\it" => "_#1_",
"\\textbf" => "**#1**",
"\\bf" => "**#1**",
"\\texttt" => "`#1`",
"\\url" => "[#1](#1)",
"\\sout" => "~~#1~~",
"\\st" => "~~#1~~",
"\\cancel" => "~~#1~~",
)
# directives that are stripped when converting
# to text/plain
#
# Every template is just `#1`, i.e. the directive is replaced by its bare
# argument. Each directive is listed under two keys: the bare name (`emph`)
# and the name with its leading backslash (`\emph`).
const text_directives = Dict{String,String}(
"emph" => "#1",
"textit" => "#1",
"it" => "#1",
"textbf" => "#1",
"bf" => "#1",
"texttt" => "#1",
"url" => "#1",
"sout" => "#1",
"st" => "#1",
"cancel" => "#1",
"\\emph" => "#1",
"\\textit" => "#1",
"\\it" => "#1",
"\\textbf" => "#1",
"\\bf" => "#1",
"\\texttt" => "#1",
"\\url" => "#1",
"\\sout" => "#1",
"\\st" => "#1",
"\\cancel" => "#1",
)
# Given a (sub)string `s` that represents a LaTeX directive matched
# by search_latexdirective, performs our Unicode substitutions and
# also any additional substitutions given by extra_directives.
#
# The directive (including its leading backslash, or empty for a bare
# `{...}` group) is looked up in `extra_directives`, then `latex_unicode`,
# then `Base.REPLCompletions.latex_symbols`; the first table containing it wins.
# Directives inside the argument are substituted recursively.
# Returns `s` unchanged if no directive is found or it is not in any table.
function directive_substitution(s::AbstractString, extra_directives::Associative{String,String})
    ds, de = search_latexdirective(s)
    ds == 0 && return s # no directive in `s`
    ae = endof(s)
    directive = SubString(s, ds, de)
    for dict in (extra_directives, latex_unicode, Base.REPLCompletions.latex_symbols)
        if haskey(dict, directive)
            sub = dict[directive]
            if contains(sub, "#1")
                # template with an explicit argument slot
                arg = striparg(replace_directives(striparg(s, de+1, ae), extra_directives))
                return replace(sub, "#1", arg)
            else
                arg = replace_directives(SubString(s, de+1, ae), extra_directives)
                if strwidth(sub) == 0 # \hat{...} etc: combining chars go after argument
                    return string(striparg(arg), sub)
                else
                    return string(sub, arg) # don't strip for 0-arg macros
                end
            end
        end
    end
    return s # ignore unrecognized directives
end
# replace all latex directives in `s` via `directive_substitution`
#
# Directives are located with the custom LaTeXDirectiveSearch searcher, and
# each matched span is passed to `directive_substitution` along with
# `extra_directives`.
replace_directives(s::AbstractString, extra_directives::Associative{String,String}) =
    replace(s, LaTeXDirectiveSearch(), sub -> directive_substitution(sub, extra_directives))

View File

@ -50,6 +50,6 @@ end
import BibTeX: replace_directives, md_directives
# Checks that replace_directives, given md_directives, turns LaTeX markup
# into the expected Markdown/Unicode text.
@testset "latex" begin
# emphasis, brace-scoped `\bf`, and an accent directive
@test replace_directives(raw"foo \emph{bar} {\bf baz} {\^{u}}", md_directives) ==
"foo _bar_ **baz** û"
# additionally: a symbol with a combining accent, a bare `{...}` group,
# backslash escapes, nested directives, and a non-alphanumeric argument (`2.0`)
@test replace_directives(raw"foo \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", md_directives) ==
"foo α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û"
end