rudimentary equation support (sub/superscripts, elimination of dollar signs)
This commit is contained in:
parent
ab60bb59d8
commit
26bfe8d705
116
src/latex.jl
116
src/latex.jl
|
@ -14,6 +14,8 @@ const BRACEOPEN = UInt8('{')
|
||||||
const BRACECLOSE = UInt8('}')
|
const BRACECLOSE = UInt8('}')
|
||||||
const SPACE = UInt8(' ')
|
const SPACE = UInt8(' ')
|
||||||
const DOLLAR = UInt8('$')
|
const DOLLAR = UInt8('$')
|
||||||
|
const CARET = UInt8('^')
|
||||||
|
const UNDERSCORE = UInt8('_')
|
||||||
isalpha8(x::UInt8) = UInt8('a') ≤ x ≤ UInt8('z') || UInt8('A') ≤ x ≤ UInt8('Z')
|
isalpha8(x::UInt8) = UInt8('a') ≤ x ≤ UInt8('z') || UInt8('A') ≤ x ≤ UInt8('Z')
|
||||||
isalnum8(x::UInt8) = UInt8('0') ≤ x ≤ UInt8('9') || isalpha8(x)
|
isalnum8(x::UInt8) = UInt8('0') ≤ x ≤ UInt8('9') || isalpha8(x)
|
||||||
|
|
||||||
|
@ -33,22 +35,28 @@ function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1
|
||||||
allspaces=true
|
allspaces=true
|
||||||
|
|
||||||
# find \foo directive or {...}:
|
# find \foo directive or {...}:
|
||||||
|
c = UInt8(0)
|
||||||
while i ≤ e
|
while i ≤ e
|
||||||
c = unsafe_load(p, i)
|
c = unsafe_load(p, i)
|
||||||
(c == BACKSLASH || c == BRACEOPEN) && break
|
(c == BACKSLASH || c == BRACEOPEN || c == CARET || c == UNDERSCORE) && break
|
||||||
c != SPACE && (allspaces = false)
|
c != SPACE && (allspaces = false)
|
||||||
i += 1
|
i += 1
|
||||||
end
|
end
|
||||||
if i ≤ e && unsafe_load(p, i) == BACKSLASH
|
if i ≤ e && c != BRACEOPEN
|
||||||
directive_start = i
|
directive_start = i
|
||||||
i += 2
|
if c == BACKSLASH
|
||||||
i-1 > e && return 0,0,0
|
i += 2
|
||||||
if isalpha8(unsafe_load(p, i-1))
|
i-1 > e && return 0,0,0
|
||||||
while i ≤ e && isalpha8(unsafe_load(p, i))
|
if isalpha8(unsafe_load(p, i-1))
|
||||||
i += 1
|
while i ≤ e && isalpha8(unsafe_load(p, i))
|
||||||
|
i += 1
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
directive_end = i-1
|
||||||
|
else
|
||||||
|
directive_end = directive_start # ^ or _
|
||||||
|
i += 1
|
||||||
end
|
end
|
||||||
directive_end = i-1
|
|
||||||
|
|
||||||
# look for optional opening brace
|
# look for optional opening brace
|
||||||
while i ≤ e && unsafe_load(p, i) == SPACE
|
while i ≤ e && unsafe_load(p, i) == SPACE
|
||||||
|
@ -63,18 +71,26 @@ function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1
|
||||||
j -= 1
|
j -= 1
|
||||||
end
|
end
|
||||||
if j < istart || unsafe_load(p, j) != BRACEOPEN
|
if j < istart || unsafe_load(p, j) != BRACEOPEN
|
||||||
# argument is not in braces … get next token
|
if unsafe_load(p, i) == BACKSLASH
|
||||||
if allspaces
|
# argument is another latex directive
|
||||||
|
ds,de,ae = search_latexdirective(s, i)
|
||||||
|
return directive_start, directive_end, ae
|
||||||
|
elseif c != BACKSLASH
|
||||||
|
# in an equation, token is a single char
|
||||||
|
return directive_start, directive_end, i
|
||||||
|
elseif allspaces
|
||||||
# if `\directive ...` was preceded only
|
# if `\directive ...` was preceded only
|
||||||
# by whitespace, then assume arguments
|
# by whitespace, then assume arguments
|
||||||
# extend to the end of the string. This
|
# extend to the end of the string. This
|
||||||
# happens when we recurse on `{\directive ...}`.
|
# happens when we recurse on `{\directive ...}`.
|
||||||
return directive_start, directive_end, e
|
return directive_start, directive_end, e
|
||||||
|
else
|
||||||
|
# argument is not in braces … get next token
|
||||||
|
while i ≤ e && isalnum8(unsafe_load(p, i))
|
||||||
|
i += 1
|
||||||
|
end
|
||||||
|
return directive_start, directive_end, i-1
|
||||||
end
|
end
|
||||||
while i ≤ e && isalnum8(unsafe_load(p, i))
|
|
||||||
i += 1
|
|
||||||
end
|
|
||||||
return directive_start, directive_end, i-1
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
i += 1
|
i += 1
|
||||||
|
@ -173,6 +189,7 @@ const latex_unicode = Dict{String,String}(
|
||||||
"\\\\" => "\\",
|
"\\\\" => "\\",
|
||||||
"\\{" => "{", "\\}" => "}",
|
"\\{" => "{", "\\}" => "}",
|
||||||
"\\%" => "%",
|
"\\%" => "%",
|
||||||
|
# "\\\$" => "\$" -- dollar signs will be unescaped in strip_dollars
|
||||||
|
|
||||||
# We parse {....} quoting as an empty directive:
|
# We parse {....} quoting as an empty directive:
|
||||||
"" => "#1",
|
"" => "#1",
|
||||||
|
@ -186,9 +203,12 @@ const md_directives = Dict{String,String}(
|
||||||
"\\emph" => "_#1_",
|
"\\emph" => "_#1_",
|
||||||
"\\textit" => "_#1_",
|
"\\textit" => "_#1_",
|
||||||
"\\it" => "_#1_",
|
"\\it" => "_#1_",
|
||||||
|
"\\mathit" => "_#1_",
|
||||||
"\\textbf" => "**#1**",
|
"\\textbf" => "**#1**",
|
||||||
"\\bf" => "**#1**",
|
"\\bf" => "**#1**",
|
||||||
|
"\\mathbf" => "**#1**",
|
||||||
"\\texttt" => "`#1`",
|
"\\texttt" => "`#1`",
|
||||||
|
"\\mathrm" => "#1",
|
||||||
"\\url" => "[#1](#1)",
|
"\\url" => "[#1](#1)",
|
||||||
"\\sout" => "~~#1~~",
|
"\\sout" => "~~#1~~",
|
||||||
"\\st" => "~~#1~~",
|
"\\st" => "~~#1~~",
|
||||||
|
@ -201,15 +221,43 @@ const text_directives = Dict{String,String}(
|
||||||
"\\emph" => "#1",
|
"\\emph" => "#1",
|
||||||
"\\textit" => "#1",
|
"\\textit" => "#1",
|
||||||
"\\it" => "#1",
|
"\\it" => "#1",
|
||||||
|
"\\mathit" => "#1",
|
||||||
"\\textbf" => "#1",
|
"\\textbf" => "#1",
|
||||||
"\\bf" => "#1",
|
"\\bf" => "#1",
|
||||||
|
"\\mathbf" => "#1",
|
||||||
"\\texttt" => "#1",
|
"\\texttt" => "#1",
|
||||||
|
"\\mathrm" => "#1",
|
||||||
"\\url" => "#1",
|
"\\url" => "#1",
|
||||||
"\\sout" => "#1",
|
|
||||||
"\\st" => "#1",
|
|
||||||
"\\cancel" => "#1",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Unicode includes only an incomplete set of super/subscript characters, so
# these tables are necessarily partial; characters without an entry cannot
# be converted.
#
# Map from an ordinary character to its Unicode superscript form.
# Notes on individual entries:
#   * 'C' and 'S' reuse the small-letter superscripts 'ᶜ' and 'ˢ' (the same
#     glyphs that 'c' and 's' map to).
#   * 'ᵠ' is the superscript of phi (U+1D60 MODIFIER LETTER SMALL GREEK PHI),
#     so it is keyed on both phi code points ('φ' U+03C6 and 'ϕ' U+03D5),
#     not on psi.  There is no entry for 'ψ'.
#   * 'ᶿ' is a small theta (U+1DBF); both 'θ' and 'Θ' map to it.
const superscripts = Dict{Char,Char}(
    '0'=>'⁰', '1'=>'¹', '2'=>'²', '3'=>'³', '4'=>'⁴', '5'=>'⁵', '6'=>'⁶', '7'=>'⁷', '8'=>'⁸', '9'=>'⁹',
    'a'=>'ᵃ', 'b'=>'ᵇ', 'c'=>'ᶜ', 'd'=>'ᵈ', 'e'=>'ᵉ', 'f'=>'ᶠ', 'g'=>'ᵍ', 'h'=>'ʰ',
    'i'=>'ⁱ', 'j'=>'ʲ', 'k'=>'ᵏ', 'l'=>'ˡ', 'm'=>'ᵐ', 'n'=>'ⁿ', 'o'=>'ᵒ', 'p'=>'ᵖ',
    'r'=>'ʳ', 's'=>'ˢ', 't'=>'ᵗ', 'u'=>'ᵘ', 'v'=>'ᵛ', 'w'=>'ʷ', 'x'=>'ˣ', 'y'=>'ʸ', 'z'=>'ᶻ',
    'A'=>'ᴬ', 'B'=>'ᴮ', 'C'=>'ᶜ', 'D'=>'ᴰ', 'E'=>'ᴱ', 'G'=>'ᴳ', 'H'=>'ᴴ', 'I'=>'ᴵ', 'J'=>'ᴶ',
    'K'=>'ᴷ', 'L'=>'ᴸ', 'M'=>'ᴹ', 'N'=>'ᴺ', 'O'=>'ᴼ', 'P'=>'ᴾ', 'R'=>'ᴿ', 'S'=>'ˢ', 'T'=>'ᵀ',
    'U'=>'ᵁ', 'V'=>'ⱽ', 'W'=>'ᵂ', 'β'=>'ᵝ', 'γ'=>'ᵞ', 'δ'=>'ᵟ', 'φ'=>'ᵠ', 'ϕ'=>'ᵠ', 'χ'=>'ᵡ',
    'θ'=>'ᶿ', 'Θ'=>'ᶿ',
    '+'=>'⁺', '-'=>'⁻', '='=>'⁼', '('=>'⁽', ')'=>'⁾', ' '=>' ',
)
|
||||||
|
# Map from an ordinary character to its Unicode subscript form.  Unicode
# defines subscripts for only some letters, so most of the alphabet is absent.
# 'ᵩ' is the subscript of phi (U+1D69 GREEK SUBSCRIPT SMALL LETTER PHI), so it
# is keyed on both phi code points ('φ' U+03C6 and 'ϕ' U+03D5), not on psi.
# There is no entry for 'ψ'.
const subscripts = Dict{Char,Char}(
    '0'=>'₀', '1'=>'₁', '2'=>'₂', '3'=>'₃', '4'=>'₄', '5'=>'₅', '6'=>'₆', '7'=>'₇', '8'=>'₈', '9'=>'₉',
    'a'=>'ₐ', 'e'=>'ₑ', 'h'=>'ₕ', 'i'=>'ᵢ', 'j'=>'ⱼ', 'k'=>'ₖ', 'l'=>'ₗ', 'm'=>'ₘ',
    'n'=>'ₙ', 'o'=>'ₒ', 'p'=>'ₚ', 'r'=>'ᵣ', 's'=>'ₛ', 't'=>'ₜ', 'u'=>'ᵤ', 'v'=>'ᵥ', 'x'=>'ₓ',
    'β'=>'ᵦ', 'γ'=>'ᵧ', 'ρ'=>'ᵨ', 'φ'=>'ᵩ', 'ϕ'=>'ᵩ', 'χ'=>'ᵪ',
    '-'=>'₋', '+'=>'₊', '='=>'₌', '('=>'₍', ')'=>'₎', ' '=>' ',
)
|
||||||
|
# Translate every character of `s` through `charmap` and return the result.
# The translation is all-or-nothing: if any character has no entry in
# `charmap` (or maps to the NUL sentinel), the empty string is returned so
# the caller can tell that `s` could not be converted in full.
function replacechars(s::AbstractString, charmap::Associative{Char,Char})
    out = IOBuffer()
    for ch in s
        mapped = get(charmap, ch, '\0')   # '\0' doubles as the "no mapping" marker
        if mapped == '\0'
            return ""
        end
        print(out, mapped)
    end
    return String(take!(out))
end
|
||||||
|
|
||||||
# Given a (sub)string `s` that represents a LaTeX directive matched
|
# Given a (sub)string `s` that represents a LaTeX directive matched
|
||||||
# by search_latexdirective, performs our Unicode substitutions and
|
# by search_latexdirective, performs our Unicode substitutions and
|
||||||
# also any additional substitutions given by extra_directives.
|
# also any additional substitutions given by extra_directives.
|
||||||
|
@ -233,9 +281,43 @@ function directive_substitution(s::AbstractString, extra_directives::Associative
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
if directive == "^" || directive == "_" # super/subscripts
|
||||||
|
arg = striparg(replace_directives(striparg(s, de+1, ae), extra_directives))
|
||||||
|
sarg = replacechars(arg, directive == "^" ? superscripts : subscripts)
|
||||||
|
!isempty(sarg) && return sarg
|
||||||
|
end
|
||||||
return s # ignore unrecognized directives
|
return s # ignore unrecognized directives
|
||||||
end
|
end
|
||||||
|
|
||||||
# Rewrite every LaTeX directive found in `s` (as located by a
# `LaTeXDirectiveSearch` pattern) with the text that
# `directive_substitution` produces for it, passing along `extra_directives`.
function replace_directives(s::AbstractString, extra_directives::Associative{String,String})
    substitute = sub -> directive_substitution(sub, extra_directives)
    return replace(s, LaTeXDirectiveSearch(), substitute)
end
|
||||||
|
|
||||||
|
# Remove every unescaped `$` from `s`.  An escaped dollar sign (`\$`) is
# emitted as a plain `$`: the `$` is written when the backslash is seen, and
# the dollar byte itself is then dropped like any other `$`.
function strip_dollars(s::Union{String,SubString{String}})
    nbytes = sizeof(s)
    bytes = pointer(s)
    out = IOBuffer()
    for k = 1:nbytes
        b = unsafe_load(bytes, k)
        b == DOLLAR && continue   # never copied directly (see the `\$` case below)
        if b == BACKSLASH && k < nbytes && unsafe_load(bytes, k+1) == DOLLAR
            write(out, DOLLAR)    # \$ -> $
        else
            write(out, b)
        end
    end
    return String(take!(out))
end
|
||||||
|
|
||||||
|
"""
    simplify_latex(s::AbstractString, extra_directives=BibTeX.text_directives)

Simplify the LaTeX string `s` into "plain text" where possible: known LaTeX
directives are stripped or converted (e.g. to Unicode), and unescaped `\$`
signs are then removed.

`extra_directives` is a dictionary (`String=>String`) that maps LaTeX
directives to their replacements.  The default, `BibTeX.text_directives`,
simply strips out things like bold and italics.  Pass `BibTeX.md_directives`
instead to render such directives with Markdown syntax.
"""
function simplify_latex(s::AbstractString,
                        extra_directives::Associative{String,String}=text_directives)
    substituted = replace_directives(s, extra_directives)
    return strip_dollars(substituted)
end
|
||||||
|
|
|
@ -48,8 +48,8 @@ end
|
||||||
@test isempty(sizehint!(empty!(x2),10))
|
@test isempty(sizehint!(empty!(x2),10))
|
||||||
end
|
end
|
||||||
|
|
||||||
import BibTeX: replace_directives, md_directives
|
import BibTeX: simplify_latex, md_directives
|
||||||
@testset "latex" begin
|
@testset "latex" begin
|
||||||
@test replace_directives(raw"foo \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", md_directives) ==
|
@test simplify_latex(raw"foo \$$x_1x_2^\mathrm{3}$ \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", md_directives) ==
|
||||||
"foo α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û"
|
"foo \$x₁x₂³ α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û"
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue