diff --git a/src/latex.jl b/src/latex.jl
index 735b186..9dd7f59 100644
--- a/src/latex.jl
+++ b/src/latex.jl
@@ -14,6 +14,8 @@ const BRACEOPEN = UInt8('{')
 const BRACECLOSE = UInt8('}')
 const SPACE = UInt8(' ')
 const DOLLAR = UInt8('$')
+const CARET = UInt8('^')
+const UNDERSCORE = UInt8('_')
 
 isalpha8(x::UInt8) = UInt8('a') ≤ x ≤ UInt8('z') || UInt8('A') ≤ x ≤ UInt8('Z')
 isalnum8(x::UInt8) = UInt8('0') ≤ x ≤ UInt8('9') || isalpha8(x)
@@ -33,22 +35,28 @@ function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1
     allspaces=true
 
     # find \foo directive or {...}:
+    c = UInt8(0)
     while i ≤ e
         c = unsafe_load(p, i)
-        (c == BACKSLASH || c == BRACEOPEN) && break
+        (c == BACKSLASH || c == BRACEOPEN || c == CARET || c == UNDERSCORE) && break
         c != SPACE && (allspaces = false)
         i += 1
     end
-    if i ≤ e && unsafe_load(p, i) == BACKSLASH
+    if i ≤ e && c != BRACEOPEN
         directive_start = i
-        i += 2
-        i-1 > e && return 0,0,0
-        if isalpha8(unsafe_load(p, i-1))
-            while i ≤ e && isalpha8(unsafe_load(p, i))
-                i += 1
+        if c == BACKSLASH
+            i += 2
+            i-1 > e && return 0,0,0
+            if isalpha8(unsafe_load(p, i-1))
+                while i ≤ e && isalpha8(unsafe_load(p, i))
+                    i += 1
+                end
             end
+            directive_end = i-1
+        else
+            directive_end = directive_start # ^ or _
+            i += 1
         end
-        directive_end = i-1
 
         # look for optional opening brace
         while i ≤ e && unsafe_load(p, i) == SPACE
@@ -63,18 +71,26 @@ function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1
                 j -= 1
             end
             if j < istart || unsafe_load(p, j) != BRACEOPEN
-                # argument is not in braces … get next token
-                if allspaces
+                if unsafe_load(p, i) == BACKSLASH
+                    # argument is another latex directive
+                    ds,de,ae = search_latexdirective(s, i)
+                    return directive_start, directive_end, ae
+                elseif c != BACKSLASH
+                    # in an equation, token is a single char
+                    return directive_start, directive_end, i
+                elseif allspaces
                     # if `\directive ...` was preceded only
                     # by whitespace, then assume arguments
                     # extend to the end of the string.  This
                     # happens when we recurse on `{\directive ...}`.
                     return directive_start, directive_end, e
+                else
+                    # argument is not in braces … get next token
+                    while i ≤ e && isalnum8(unsafe_load(p, i))
+                        i += 1
+                    end
+                    return directive_start, directive_end, i-1
                 end
-                while i ≤ e && isalnum8(unsafe_load(p, i))
-                    i += 1
-                end
-                return directive_start, directive_end, i-1
             end
         end
         i += 1
@@ -173,6 +189,7 @@ const latex_unicode = Dict{String,String}(
     "\\\\" => "\\",
     "\\{" => "{", "\\}" => "}",
     "\\%" => "%",
+    # "\\\$" => "\$" -- dollar signs will be unescaped in strip_dollars
 
     # We parse {....} quoting as an empty directive:
     "" => "#1",
@@ -186,9 +203,12 @@ const md_directives = Dict{String,String}(
     "\\emph" => "_#1_",
     "\\textit" => "_#1_",
     "\\it" => "_#1_",
+    "\\mathit" => "_#1_",
     "\\textbf" => "**#1**",
     "\\bf" => "**#1**",
+    "\\mathbf" => "**#1**",
     "\\texttt" => "`#1`",
+    "\\mathrm" => "#1",
     "\\url" => "[#1](#1)",
     "\\sout" => "~~#1~~",
     "\\st" => "~~#1~~",
@@ -201,15 +221,45 @@ const text_directives = Dict{String,String}(
     "\\emph" => "#1",
     "\\textit" => "#1",
     "\\it" => "#1",
+    "\\mathit" => "#1",
     "\\textbf" => "#1",
     "\\bf" => "#1",
+    "\\mathbf" => "#1",
     "\\texttt" => "#1",
+    "\\mathrm" => "#1",
     "\\url" => "#1",
-    "\\sout" => "#1",
-    "\\st" => "#1",
-    "\\cancel" => "#1",
 )
 
+# Unicode includes an incomplete set of super/subscript characters:
+const superscripts = Dict{Char,Char}(
+    '0'=>'⁰', '1'=>'¹', '2'=>'²', '3'=>'³', '4'=>'⁴', '5'=>'⁵', '6'=>'⁶', '7'=>'⁷', '8'=>'⁸', '9'=>'⁹',
+    'a'=>'ᵃ', 'b'=>'ᵇ', 'c'=>'ᶜ', 'd'=>'ᵈ', 'e'=>'ᵉ', 'f'=>'ᶠ', 'g'=>'ᵍ', 'h'=>'ʰ',
+    'i'=>'ⁱ', 'j'=>'ʲ', 'k'=>'ᵏ', 'l'=>'ˡ', 'm'=>'ᵐ', 'n'=>'ⁿ', 'o'=>'ᵒ', 'p'=>'ᵖ',
+    'r'=>'ʳ', 's'=>'ˢ', 't'=>'ᵗ', 'u'=>'ᵘ', 'v'=>'ᵛ', 'w'=>'ʷ', 'x'=>'ˣ', 'y'=>'ʸ', 'z'=>'ᶻ',
+    'A'=>'ᴬ', 'B'=>'ᴮ', 'C'=>'ᶜ', 'D'=>'ᴰ', 'E'=>'ᴱ', 'G'=>'ᴳ', 'H'=>'ᴴ', 'I'=>'ᴵ', 'J'=>'ᴶ',
+    'K'=>'ᴷ', 'L'=>'ᴸ', 'M'=>'ᴹ', 'N'=>'ᴺ', 'O'=>'ᴼ', 'P'=>'ᴾ', 'R'=>'ᴿ', 'S'=>'ˢ', 'T'=>'ᵀ',
+    'U'=>'ᵁ', 'V'=>'ⱽ', 'W'=>'ᵂ', 'β'=>'ᵝ', 'γ'=>'ᵞ', 'δ'=>'ᵟ', 'ϕ'=>'ᵠ', 'φ'=>'ᵠ', 'χ'=>'ᵡ', 'θ'=>'ᶿ', 'Θ'=>'ᶿ',
+    '+'=>'⁺', '-'=>'⁻', '='=>'⁼', '('=>'⁽', ')'=>'⁾', ' '=>' ',
+)
+const subscripts = Dict{Char,Char}(
+    '0'=>'₀', '1'=>'₁', '2'=>'₂', '3'=>'₃', '4'=>'₄', '5'=>'₅', '6'=>'₆', '7'=>'₇', '8'=>'₈', '9'=>'₉',
+    'a'=>'ₐ', 'e'=>'ₑ', 'h'=>'ₕ', 'i'=>'ᵢ', 'j'=>'ⱼ', 'k'=>'ₖ', 'l'=>'ₗ', 'm'=>'ₘ',
+    'n'=>'ₙ', 'o'=>'ₒ', 'p'=>'ₚ', 'r'=>'ᵣ', 's'=>'ₛ', 't'=>'ₜ', 'u'=>'ᵤ', 'v'=>'ᵥ', 'x'=>'ₓ',
+    'β'=>'ᵦ', 'γ'=>'ᵧ', 'ρ'=>'ᵨ', 'ϕ'=>'ᵩ', 'φ'=>'ᵩ', 'χ'=>'ᵪ',
+    '-'=>'₋', '+'=>'₊', '='=>'₌', '('=>'₍', ')'=>'₎', ' '=>' ',
+)
+# Map each character of `s` through `charmap`; return "" if any character
+# has no entry (so the caller can leave the original text untouched).
+function replacechars(s::AbstractString, charmap::Associative{Char,Char})
+    buf = IOBuffer()
+    for c in s
+        cm = get(charmap, c, '\0')
+        cm == '\0' && return ""
+        print(buf, cm)
+    end
+    return String(take!(buf))
+end
+
 # Given a (sub)string `s` that represents a LaTeX directive matched
 # by search_latexdirective, performs our Unicode substitutions and
 # also any additional substitutions given by extra_directives.
@@ -233,9 +283,43 @@ function directive_substitution(s::AbstractString, extra_directives::Associative
             end
         end
     end
+    if directive == "^" || directive == "_" # super/subscripts
+        arg = striparg(replace_directives(striparg(s, de+1, ae), extra_directives))
+        sarg = replacechars(arg, directive == "^" ? superscripts : subscripts)
+        !isempty(sarg) && return sarg
+    end
     return s # ignore unrecognized directives
 end
 
 # replace all latex directives in `s` via `directive_substitution`
 replace_directives(s::AbstractString, extra_directives::Associative{String,String}) =
     replace(s, LaTeXDirectiveSearch(), sub -> directive_substitution(sub, extra_directives))
+
+# strip unescaped $ signs from s, and unescape \$ to $
+function strip_dollars(s::Union{String,SubString{String}})
+    buf = IOBuffer()
+    p = pointer(s)
+    for i = 1:sizeof(s)
+        c = unsafe_load(p, i)
+        if c == BACKSLASH && i < sizeof(s) && unsafe_load(p, i+1) == DOLLAR
+            write(buf, DOLLAR) # \$ -> $
+        elseif c != DOLLAR
+            write(buf, c)
+        end
+    end
+    return String(take!(buf))
+end
+
+"""
+    simplify_latex(s::AbstractString, extra_directives=BibTeX.text_directives)
+
+Simplify a LaTeX string `s` into "plain text" if possible, stripping/converting
+known LaTeX directives in favor of e.g. Unicode.
+
+`extra_directives` is a dictionary (`String=>String`) that maps LaTeX directives
+to replacements. It defaults to `BibTeX.text_directives`, which simply strips
+out things like bold and italics. Alternatively, you can pass `BibTeX.md_directives`,
+which uses Markdown syntax for such directives.
+"""
+simplify_latex(s::AbstractString, extra_directives::Associative{String,String}=text_directives) =
+    strip_dollars(replace_directives(s, extra_directives))
diff --git a/test/runtests.jl b/test/runtests.jl
index 4a19eb8..0904837 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -48,8 +48,8 @@ end
     @test isempty(sizehint!(empty!(x2),10))
 end
 
-import BibTeX: replace_directives, md_directives
+import BibTeX: simplify_latex, md_directives
 @testset "latex" begin
-    @test replace_directives(raw"foo \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", md_directives) ==
-        "foo α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û"
+    @test simplify_latex(raw"foo \$$x_1x_2^\mathrm{3}$ \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", md_directives) ==
+        "foo \$x₁x₂³ α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û"
 end