commit
3f6dfb22d1
|
@ -4,5 +4,6 @@ export Bibliography, Citation
|
|||
include("parser.jl")
|
||||
include("citation.jl")
|
||||
include("bibliography.jl")
|
||||
include("latex.jl")
|
||||
|
||||
end
|
||||
|
|
|
@ -0,0 +1,324 @@
|
|||
# Conversion of LaTeX directives to plain text, Markdown, etc.
#
# The basic idea is that we search for `\foo{arg}`, `{\foo arg}`,
# or `{\foo{arg}}`, and look up `\foo` in a dictionary of substitutions
# like `\textit` -> `_#1_`, where `#1` marks where the (first) argument is
# substituted. We keep separate dictionaries of substitutions for each
# output format (text/plain, text/markdown, etc.).
|
||||
|
||||
###########################################################################
|
||||
# parsing LaTeX directives:
|
||||
|
||||
# Byte values of the ASCII characters that the LaTeX scanner treats specially.
const BACKSLASH  = 0x5c  # '\\'
const BRACEOPEN  = 0x7b  # '{'
const BRACECLOSE = 0x7d  # '}'
const SPACE      = 0x20  # ' '
const DOLLAR     = 0x24  # '$'
const CARET      = 0x5e  # '^'
const UNDERSCORE = 0x5f  # '_'
|
||||
# ASCII-only character classes operating on raw bytes.

# `true` if the byte `x` is an ASCII letter (a-z or A-Z).
function isalpha8(x::UInt8)
    return x in UInt8('a'):UInt8('z') || x in UInt8('A'):UInt8('Z')
end

# `true` if the byte `x` is an ASCII letter or an ASCII digit (0-9).
function isalnum8(x::UInt8)
    return isalpha8(x) || x in UInt8('0'):UInt8('9')
end
|
||||
"""
    search_latexdirective(string, istart=1)

Search for a LaTeX directive `\\directive{arg}` or similar in `string`, starting at
byte index `istart`. Returns `(ds, de, ae)` such that `string[ds:de]` gives
`\\directive` and `string[de+1:ae]` gives `{arg}`. Use [`striparg`](@ref) to remove
surrounding braces and whitespace from the `arg`.

`^` and `_` are treated as one-character directives, and a bare `{...}` group is
reported as an empty directive (`de == ds - 1`) whose argument is the group itself.

Returns `(0, 0, 0)` if `istart` is out of range, if no directive is found, or if
the first candidate is a backslash that is the last byte of `string`.
"""
function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1)
    e = sizeof(s) # all scanning below is done on bytes, not characters
    0 < istart ≤ e || return 0,0,0
    p = Vector{UInt8}(s)
    i = istart
    allspaces = true # true while only spaces have been seen before the directive

    # find \foo directive or {...}:
    c = UInt8(0)
    while i ≤ e
        c = p[i]
        (c == BACKSLASH || c == BRACEOPEN || c == CARET || c == UNDERSCORE) && break
        c != SPACE && (allspaces = false)
        i += 1
    end
    if i ≤ e && c != BRACEOPEN
        directive_start = i
        if c == BACKSLASH
            # the directive name is at least one byte after the backslash …
            i += 2
            i-1 > e && return 0,0,0 # … so a trailing lone backslash is no directive
            # … and extends over the whole run of letters if it starts with a letter
            if isalpha8(p[i-1])
                while i ≤ e && isalpha8(p[i])
                    i += 1
                end
            end
            directive_end = i-1
        else
            directive_end = directive_start # ^ or _
            i += 1
        end

        # look for optional opening brace
        while i ≤ e && p[i] == SPACE
            i += 1
        end
        i > e && return directive_start, directive_end, e
        inbrace = p[i] == BRACEOPEN
        if !inbrace
            # search backwards from \foo to look for { \foo ...}
            j = directive_start - 1
            while j ≥ istart && p[j] == SPACE
                j -= 1
            end
            if j < istart || p[j] != BRACEOPEN
                if p[i] == BACKSLASH
                    # argument is another latex directive
                    nested = search_latexdirective(s, i)
                    ae = nested[3]
                    # The nested search fails (returns 0) when that backslash is
                    # the last byte of the string. Returning 0 here would yield
                    # the empty range `ds:0`, so let the argument run to the end
                    # of the string instead.
                    return directive_start, directive_end, ae == 0 ? e : ae
                elseif c != BACKSLASH
                    # in an equation, token is a single char
                    return directive_start, directive_end, i
                elseif allspaces
                    # if `\directive ...` was preceded only
                    # by whitespace, then assume arguments
                    # extend to the end of the string. This
                    # happens when we recurse on `{\directive ...}`.
                    return directive_start, directive_end, e
                else
                    # argument is not in braces … get next token
                    while i ≤ e && isalnum8(p[i])
                        i += 1
                    end
                    return directive_start, directive_end, i-1
                end
            end
        end
        i += 1
    elseif i > e
        return 0,0,0
    else # { ... }
        directive_start = i
        directive_end = i - 1 # empty directive: the argument starts at the brace
        inbrace = true
        i += 1
    end

    # search for end of argument (closing brace)
    nbraces = 1
    while i ≤ e
        c = p[i]
        if c == BRACEOPEN
            nbraces += 1
        elseif c == BRACECLOSE
            nbraces -= 1
            if nbraces == 0
                # for `{\foo arg}` the closing brace belongs to the enclosing
                # group, not to the argument, so it is excluded
                return directive_start, directive_end, inbrace ? i : i-1
            end
        end
        i += 1
    end
    # unbalanced braces: the argument extends to the end of the string
    return directive_start, directive_end, e
end
|
||||
"""
    striparg(s, argstart=start(s), argend=endof(s))

Extract the argument occupying `argstart:argend` in `s` and return it as a
`SubString` of `s`. If the range ends in a closing brace, that brace and everything
up to and including the first opening brace in the range are dropped; leading and
trailing spaces are then removed.

Returns an empty substring if `argstart > argend`. Throws a `BoundsError` if either
index lies outside `1:endof(s)`, and an error if the range ends in `}` but contains
no `{`.
"""
function striparg(s::Union{String,SubString{String}}, argstart::Int=start(s), argend::Int=endof(s))
    lo, hi = argstart, argend
    if lo > hi
        return SubString(s, 1, 0)
    end
    final = endof(s)
    inrange(k) = 1 ≤ k ≤ final
    if !(inrange(lo) && inrange(hi))
        throw(BoundsError())
    end

    bytes = Vector{UInt8}(s)
    if bytes[hi] == BRACECLOSE
        hi -= 1 # drop the closing brace
        # advance to the matching-side opening brace
        while lo ≤ hi && bytes[lo] != BRACEOPEN
            lo += 1
        end
        if lo > hi
            error("malformed argument")
        end
        lo += 1 # drop the opening brace
    end

    # trim spaces, first on the right, then on the left
    while lo ≤ hi && bytes[hi] == SPACE
        hi -= 1
    end
    while lo ≤ hi && bytes[lo] == SPACE
        lo += 1
    end
    return SubString(s, lo, hi)
end
|
||||
|
||||
# `replace` finds its matches through `search(s, pattern, i)`. By defining that
# method for a dedicated marker type, `replace(s, LaTeXDirectiveSearch(), f)`
# is driven by our custom directive scanner: the returned range covers the
# directive together with its argument.
struct LaTeXDirectiveSearch end

function Base.search(s::AbstractString, ::LaTeXDirectiveSearch, i::Integer)
    found = search_latexdirective(s, i)
    first_byte, last_byte = found[1], found[3]
    if first_byte < i
        # nothing found at or after `i`
        return 0:-1
    end
    return first_byte:last_byte
end
|
||||
###########################################################################
|
||||
|
||||
# Unicode substitutions for LaTeX directives. In a replacement, `#1` marks
# where the directive's argument is inserted.
const latex_unicode = Dict{String,String}(
    # Accent escapes like `\"u` for `ü`, from the list at
    # https://en.wikibooks.org/wiki/LaTeX/Special_Characters
    # converted to Unicode characters (mostly combining marks, which
    # follow the character they modify).
    "\\`"  => "#1\u0300",
    "\\'"  => "#1\u0301",
    "\\^"  => "#1\u0302",
    "\\\"" => "#1\u0308",
    "\\H"  => "#1\u030b",
    "\\~"  => "#1\u0303",
    "\\c"  => "#1\u0327",
    "\\k"  => "#1\u0328",
    "\\l"  => "\u0142",
    "\\="  => "#1\u0304",
    "\\b"  => "#1\u0331",
    "\\."  => "#1\u0307",
    "\\d"  => "#1\u0323",
    "\\r"  => "#1\u030a",
    "\\u"  => "#1\u0306",
    "\\v"  => "#1\u030c",
    "\\t"  => "#1\u0361", # fixme: u+0361 should go after first char in #1
    "\\o"  => "\u00f8",
    "\\i"  => "\u0131",
    "\\j"  => "\u0237",

    # Other backslash escapes.
    "\\\\" => "\\",
    "\\{"  => "{",
    "\\}"  => "}",
    "\\%"  => "%",
    # "\\\$" => "\$" -- dollar signs will be unescaped in strip_dollars

    # `{....}` quoting is parsed as an empty directive:
    "" => "#1",

    # Many other substitutions can be found in
    # Base.REPLCompletions.latex_symbols
)
|
||||
|
||||
# Markdown renderings of LaTeX formatting directives; `#1` marks where the
# directive's argument is inserted.
const md_directives = Dict{String,String}(
    # italics
    "\\emph"   => "_#1_",
    "\\textit" => "_#1_",
    "\\it"     => "_#1_",
    "\\mathit" => "_#1_",
    # bold
    "\\textbf" => "**#1**",
    "\\bf"     => "**#1**",
    "\\mathbf" => "**#1**",
    # code, upright text, links
    "\\texttt" => "`#1`",
    "\\mathrm" => "#1",
    "\\url"    => "[#1](#1)",
    # strikethrough
    "\\sout"   => "~~#1~~",
    "\\st"     => "~~#1~~",
    "\\cancel" => "~~#1~~",
)
|
||||
|
||||
# Formatting directives that are simply stripped when converting to
# text/plain: each one is replaced by its bare argument (`#1`).
const text_directives = Dict{String,String}(
    name => "#1" for name in (
        "\\emph", "\\textit", "\\it", "\\mathit",
        "\\textbf", "\\bf", "\\mathbf",
        "\\texttt", "\\mathrm", "\\url",
    )
)
|
||||
|
||||
# Unicode includes an incomplete set of super/subscript characters.
# The two strings below are parallel: the k-th character of the first maps
# to the k-th character of the second.
const superscripts = Dict{Char,Char}(zip(
    "0123456789" * "abcdefgh" * "ijklmnop" * "rstuvwxyz" *
    "ABCDEGHIJ" * "KLMNOPRST" * "UVWβγδψχΘ" * "+-=() ∘",
    "⁰¹²³⁴⁵⁶⁷⁸⁹" * "ᵃᵇᶜᵈᵉᶠᵍʰ" * "ⁱʲᵏˡᵐⁿᵒᵖ" * "ʳˢᵗᵘᵛʷˣʸᶻ" *
    "ᴬᴮᶜᴰᴱᴳᴴᴵᴶ" * "ᴷᴸᴹᴺᴼᴾᴿˢᵀ" * "ᵁⱽᵂᵝᵞᵟᵠᵡᶿ" * "⁺⁻⁼⁽⁾ °",
))
|
||||
# Subscript counterparts available in Unicode. The two strings are parallel:
# the k-th character of the first maps to the k-th character of the second.
const subscripts = Dict{Char,Char}(zip(
    "0123456789" * "aehijklm" * "noprstuvx" * "βγρψχ" * "-+=() ",
    "₀₁₂₃₄₅₆₇₈₉" * "ₐₑₕᵢⱼₖₗₘ" * "ₙₒₚᵣₛₜᵤᵥₓ" * "ᵦᵧᵨᵩᵪ" * "₋₊₌₍₎ ",
))
|
||||
# Map every character of `s` through `charmap` and return the resulting string.
# The conversion is all-or-nothing: if any character has no entry in `charmap`
# (or maps to '\0', which is used as the "missing" sentinel), the empty string
# is returned instead.
function replacechars(s::AbstractString, charmap::Associative{Char,Char})
    mapped = Char[get(charmap, ch, '\0') for ch in s]
    if '\0' in mapped
        return ""
    end
    return join(mapped)
end
|
||||
|
||||
# Given a (sub)string `s` that represents one LaTeX directive (with its
# argument) as matched by search_latexdirective, return its replacement text.
#
# The directive is looked up in `extra_directives`, then in our Unicode
# substitutions, then in Base.REPLCompletions.latex_symbols; the first table
# containing it wins. Directives nested inside the argument are substituted
# recursively. `^` and `_` are converted to Unicode super/subscripts when
# every character of the argument has one. Anything unrecognized is returned
# unchanged.
function directive_substitution(s::AbstractString, extra_directives::Associative{String,String})
    ds, de = search_latexdirective(s)
    ae = endof(s)
    directive = SubString(s, ds, de)
    argstart = de + 1

    for table in (extra_directives, latex_unicode, Base.REPLCompletions.latex_symbols)
        haskey(table, directive) || continue
        template = table[directive]
        if contains(template, "#1")
            # one-argument macro: substitute the processed, stripped argument
            inner = replace_directives(striparg(s, argstart, ae), extra_directives)
            return replace(template, "#1", striparg(inner))
        end
        rest = replace_directives(SubString(s, argstart, ae), extra_directives)
        if strwidth(template) == 0
            # \hat{...} etc: combining chars go after argument
            return string(striparg(rest), template)
        end
        return string(template, rest) # don't strip for 0-arg macros
    end

    issuper = directive == "^"
    if issuper || directive == "_" # super/subscripts
        inner = replace_directives(striparg(s, argstart, ae), extra_directives)
        converted = replacechars(striparg(inner), issuper ? superscripts : subscripts)
        isempty(converted) || return converted
    end
    return s # ignore unrecognized directives
end
|
||||
|
||||
# Replace every LaTeX directive found in `s` by the result of
# `directive_substitution`, using `extra_directives` as the first lookup table.
function replace_directives(s::AbstractString, extra_directives::Associative{String,String})
    substitute(m) = directive_substitution(m, extra_directives)
    return replace(s, LaTeXDirectiveSearch(), substitute)
end
|
||||
|
||||
# Remove every unescaped `$` from `s`; an escaped `\$` becomes a plain `$`.
function strip_dollars(s::Union{String,SubString{String}})
    bytes = Vector{UInt8}(s)
    nbytes = sizeof(s)
    out = IOBuffer()
    for (k, b) in enumerate(bytes)
        # A `$` byte is never copied as such: it is either unescaped (dropped)
        # or was already emitted when its preceding backslash was processed.
        b == DOLLAR && continue
        escapes_dollar = b == BACKSLASH && k < nbytes && bytes[k+1] == DOLLAR
        write(out, escapes_dollar ? DOLLAR : b) # \$ -> $
    end
    return String(take!(out))
end
|
||||
"""
    simplify_latex(s::AbstractString, extra_directives=BibTeX.text_directives)

Simplify a LaTeX string `s` into "plain text" if possible: known LaTeX directives
are stripped or converted (e.g. to Unicode), and unescaped dollar signs are removed.

`extra_directives` is a dictionary (`String=>String`) that maps LaTeX directives
to replacements. It defaults to `BibTeX.text_directives`, which simply strips
out things like bold and italics. Alternatively, you can pass `BibTeX.md_directives`,
which uses Markdown syntax for such directives.
"""
function simplify_latex(s::AbstractString, extra_directives::Associative{String,String}=text_directives)
    substituted = replace_directives(s, extra_directives)
    return strip_dollars(substituted)
end
|
|
@ -47,3 +47,9 @@ end
|
|||
@test isempty(similar(x2))
|
||||
@test isempty(sizehint!(empty!(x2),10))
|
||||
end
|
||||
|
||||
import BibTeX: simplify_latex, md_directives
|
||||
@testset "latex" begin
    # One input covering an escaped dollar, inline math with sub/superscripts,
    # a combining accent, brace quoting, escaped characters, nested
    # formatting directives, the `{\bf ...}` form and an accent escape.
    latex = raw"foo \$$x_1x_2^\mathrm{3}$ \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}"
    expected = "foo \$x₁x₂³ α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û"
    @test simplify_latex(latex, md_directives) == expected
end
|
||||
|
|
Loading…
Reference in New Issue