From 9c8c04e950184ae668fb67c9ea9c29e0e850bcba Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Wed, 2 Aug 2017 12:44:40 -0400 Subject: [PATCH 1/4] initial attempt at latex substitutions --- src/BibTeX.jl | 1 + src/latex.jl | 108 +++++++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 6 +++ 3 files changed, 115 insertions(+) create mode 100644 src/latex.jl diff --git a/src/BibTeX.jl b/src/BibTeX.jl index b17118d..5827123 100644 --- a/src/BibTeX.jl +++ b/src/BibTeX.jl @@ -4,5 +4,6 @@ export Bibliography, Citation include("parser.jl") include("citation.jl") include("bibliography.jl") +include("latex.jl") end diff --git a/src/latex.jl b/src/latex.jl new file mode 100644 index 0000000..9459448 --- /dev/null +++ b/src/latex.jl @@ -0,0 +1,108 @@ +# conversion of LaTeX directives to plain text, markdown, etc. +# +# The basic idea is that we search for `\foo{arg}`, `{\foo arg}`, +# or `{\foo{arg}}`, and look up `foo` in a dictionary of substitutions +# like `textit` -> `*#1*` where #1 is where the (first) argument is +# substituted. Then we have separate dictionary entries for text/plain, +# text/markdown, etcetera. + +# regex matching (directive,arg) +const latex_directive = r"\\(\W|[A-Za-z]+) *\{([^}]*)\}|\{ *\\(\W|[A-Za-z]+) *(\{([^}]*)\}|[^}]*)\}|\\(\W|[A-Za-z]+) *(\w*)" + +# given a match m to latex_directive, return (directive,arg) +function extract_directive(m::RegexMatch) + m.captures[1] !== nothing && return (m.captures[1], m.captures[2]) + m.captures[3] !== nothing && return (m.captures[3], m.captures[5] === nothing ? m.captures[4] : m.captures[5]) + m.captures[6] !== nothing && return (m.captures[6], m.captures[7]) + throw(ArgumentError("unknown latex_directive match")) +end + +# Unicode substitutions for LaTeX directives +const latex_unicode = Dict{String,String}( + # accent escapes like `\"u` for `ü`, from the list at + # https://en.wikibooks.org/wiki/LaTeX/Special_Characters + # converted to LaTeX characters (mostly combining marks) + "`" => "#1\u0300", + "'" => "#1\u0301", + "^" => "#1\u0302", + "\"" => "#1\u0308", + "H" => "#1\u030b", + "~" => "#1\u0303", + "c" => "#1\u0327", + "k" => "#1\u0328", + "l" => "\u0142#1", + "=" => "#1\u0304", + "b" => "#1\u0331", + "." => "#1\u0307", + "d" => "#1\u0323", + "r" => "#1\u030a", + "u" => "#1\u0306", + "v" => "#1\u030c", + "t" => "#1\u0361", # fixme: u+0361 should go after first char in #1 + "o" => "\u00f8#1", + "i" => "\u0131#1", + "j" => "\u0237#1", + + # many other substitutions can be found in + # Base.REPLCompletions.latex_symbols +) + +# LaTeX directives converted to Markdown +const md_directives = Dict{String,String}( + "emph" => "_#1_", + "textit" => "_#1_", + "it" => "_#1_", + "textbf" => "**#1**", + "bf" => "**#1**", + "texttt" => "`#1`", + "url" => "[#1](#1)", + "sout" => "~~#1~~", + "st" => "~~#1~~", + "cancel" => "~~#1~~", +) + +# directives that are stripped when converting +# to text/plain +const text_directives = Dict{String,String}( + "emph" => "#1", + "textit" => "#1", + "it" => "#1", + "textbf" => "#1", + "bf" => "#1", + "texttt" => "#1", + "url" => "#1", + "sout" => "#1", + "st" => "#1", + "cancel" => "#1", +) + +# Given a string `s` that matches the latex_directive regex, +# return a new string to replace it with. We perform substitutions +# based on extra_directives as well as on latex_unicode, from above. +function directive_substitution(s::AbstractString, extra_directives::Associative{String,String}) + m = match(latex_directive, s) + m === nothing && return s + directive, arg_ = extract_directive(m) + arg = replace_directives(arg_, extra_directives) # recursively replace in args + if haskey(extra_directives, directive) + return replace(extra_directives[directive], "#1", arg) + elseif haskey(latex_unicode, directive) + return replace(latex_unicode[directive], "#1", arg) + else + bdir = string('\\', directive) + if haskey(Base.REPLCompletions.latex_symbols, bdir) + sym = Base.REPLCompletions.latex_symbols[bdir] + if isempty(sym) || strwidth(sym) > 0 + return string(sym, arg) + else + return string(arg, sym) # combining character like \hat{x} + end + else + return s # no substitutions found + end + end +end + +# replace all latex directives in `s` via `directive_substitution` +replace_directives(s::AbstractString, extra_directives::Associative{String,String}) = + replace(s, latex_directive, sub -> directive_substitution(sub, extra_directives)) diff --git a/test/runtests.jl b/test/runtests.jl index 0831462..944fc38 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -47,3 +47,9 @@ end @test isempty(similar(x2)) @test isempty(sizehint!(empty!(x2),10)) end + +import BibTeX: replace_directives, md_directives +@testset "latex" begin + @test replace_directives(raw"foo \emph{bar} {\bf baz} {\^{u}}", md_directives) == + "foo _bar_ **baz** û" +end From ab60bb59d84e8463b05aa43929dacbf6715b8983 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 3 Aug 2017 09:40:14 -0400 Subject: [PATCH 2/4] implement custom directive parser --- src/latex.jl | 273 +++++++++++++++++++++++++++++++++++------------ test/runtests.jl | 4 +- 2 files changed, 205 insertions(+), 72 deletions(-) diff --git a/src/latex.jl b/src/latex.jl index 9459448..735b186 100644 --- a/src/latex.jl +++ b/src/latex.jl @@ -2,46 +2,180 @@ # # The basic idea is that we search for `\foo{arg}`, `{\foo arg}`, # or `{\foo{arg}}`, and look up `foo` in a dictionary of substitutions -# like `textit` -> `*#1*` where #1 is where the (first) argument is +# like `\textit` -> `*#1*` where #1 is where the (first) argument is # substituted. Then we have separate dictionary entries for text/plain, # text/markdown, etcetera. -# regex matching (directive,arg) -const latex_directive = r"\\(\W|[A-Za-z]+) *\{([^}]*)\}|\{ *\\(\W|[A-Za-z]+) *(\{([^}]*)\}|[^}]*)\}|\\(\W|[A-Za-z]+) *(\w*)" +########################################################################### +# parsing LaTeX directives: -# given a match m to latex_directive, return (directive,arg) -function extract_directive(m::RegexMatch) - m.captures[1] !== nothing && return (m.captures[1], m.captures[2]) - m.captures[3] !== nothing && return (m.captures[3], m.captures[5] === nothing ? m.captures[4] : m.captures[5]) - m.captures[6] !== nothing && return (m.captures[6], m.captures[7]) - throw(ArgumentError("unknown latex_directive match")) +const BACKSLASH = UInt8('\\') +const BRACEOPEN = UInt8('{') +const BRACECLOSE = UInt8('}') +const SPACE = UInt8(' ') +const DOLLAR = UInt8('$') +isalpha8(x::UInt8) = UInt8('a') ≤ x ≤ UInt8('z') || UInt8('A') ≤ x ≤ UInt8('Z') +isalnum8(x::UInt8) = UInt8('0') ≤ x ≤ UInt8('9') || isalpha8(x) + +""" + search_latexdirective(string, istart=1, inbrace=false) + +Search for a LaTeX directive \\directive{arg} or similar in `string`, returning +`(ds, de, ae)` such that `string[ds:de]` gives `\\directive` and `string[de+1:ae]` +gives `{arg}`. Use [`striparg`](@ref) to remove surrounding braces and whitespace +from the `arg`. +""" +function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1) + e = sizeof(s) + 0 < istart ≤ e || return 0,0,0 + p = pointer(s) + i = istart + allspaces=true + + # find \foo directive or {...}: + while i ≤ e + c = unsafe_load(p, i) + (c == BACKSLASH || c == BRACEOPEN) && break + c != SPACE && (allspaces = false) + i += 1 + end + if i ≤ e && unsafe_load(p, i) == BACKSLASH + directive_start = i + i += 2 + i-1 > e && return 0,0,0 + if isalpha8(unsafe_load(p, i-1)) + while i ≤ e && isalpha8(unsafe_load(p, i)) + i += 1 + end + end + directive_end = i-1 + + # look for optional opening brace + while i ≤ e && unsafe_load(p, i) == SPACE + i += 1 + end + i > e && return directive_start, directive_end, e + inbrace = unsafe_load(p, i) == BRACEOPEN + if !inbrace + # search backwards from \foo to look for { \foo ...} + j = directive_start - 1 + while j ≥ istart && unsafe_load(p, j) == SPACE + j -= 1 + end + if j < istart || unsafe_load(p, j) != BRACEOPEN + # argument is not in braces … get next token + if allspaces + # if `\directive ...` was preceded only + # by whitespace, then assume arguments + # extend to the end of the string. This + # happens when we recurse on `{\directive ...}`. + return directive_start, directive_end, e + end + while i ≤ e && isalnum8(unsafe_load(p, i)) + i += 1 + end + return directive_start, directive_end, i-1 + end + end + i += 1 + elseif i > e + return 0,0,0 + else # { ... } + directive_start = i + directive_end = i - 1 + inbrace = true + i += 1 + end + + # search for end of argument (closing brace) + nbraces = 1 + while i ≤ e + c = unsafe_load(p, i) + if c == BRACEOPEN + nbraces += 1 + elseif c == BRACECLOSE + nbraces -= 1 + if nbraces == 0 + return directive_start, directive_end, inbrace ? i : i-1 + end + end + i += 1 + end + return directive_start, directive_end, e end +""" + striparg(s, argstart=start(s), argend=endof(s)) + +Return the substring of `s` corresponding to the argument from `argstart:argend`, stripping +leading/trailing whitespace and braces. +""" +function striparg(s::Union{String,SubString{String}}, argstart::Int=start(s), argend::Int=endof(s)) + e = endof(s) + (1 ≤ argstart ≤ e && 1 ≤ argend ≤ e) || throw(BoundsError()) + + p = pointer(s) + if unsafe_load(p, argend) == BRACECLOSE + argend -= 1 # omit brace + while argstart ≤ argend && unsafe_load(p, argstart) != BRACEOPEN + argstart += 1 + end + argstart > argend && error("malformed argument") + argstart += 1 # omit brace + end + while argstart ≤ argend && unsafe_load(p, argend) == SPACE + argend -= 1 + end + while argstart ≤ argend && unsafe_load(p, argstart) == SPACE + argstart += 1 + end + return SubString(s, argstart, argend) +end + +# to make replace work for LaTeX directives with our +# custom search function, all we need to do is to define +# a LaTeXDirectiveSearch type such that search(s, ::LaTeXDirectiveSearch, i) +# returns the range of the directive +struct LaTeXDirectiveSearch; end +function Base.search(s::AbstractString, ::LaTeXDirectiveSearch, i::Integer) + ds, de, ae = search_latexdirective(s, i) + return ds < i ? (0:-1) : (ds:ae) +end +########################################################################### + # Unicode substitutions for LaTeX directives const latex_unicode = Dict{String,String}( # accent escapes like `\"u` for `ü`, from the list at # https://en.wikibooks.org/wiki/LaTeX/Special_Characters # converted to LaTeX characters (mostly combining marks) - "`" => "#1\u0300", - "'" => "#1\u0301", - "^" => "#1\u0302", - "\"" => "#1\u0308", - "H" => "#1\u030b", - "~" => "#1\u0303", - "c" => "#1\u0327", - "k" => "#1\u0328", - "l" => "\u0142#1", - "=" => "#1\u0304", - "b" => "#1\u0331", - "." => "#1\u0307", - "d" => "#1\u0323", - "r" => "#1\u030a", - "u" => "#1\u0306", - "v" => "#1\u030c", - "t" => "#1\u0361", # fixme: u+0361 should go after first char in #1 - "o" => "\u00f8#1", - "i" => "\u0131#1", - "j" => "\u0237#1", + "\\`" => "#1\u0300", + "\\'" => "#1\u0301", + "\\^" => "#1\u0302", + "\\\"" => "#1\u0308", + "\\H" => "#1\u030b", + "\\~" => "#1\u0303", + "\\c" => "#1\u0327", + "\\k" => "#1\u0328", + "\\l" => "\u0142", + "\\=" => "#1\u0304", + "\\b" => "#1\u0331", + "\\." => "#1\u0307", + "\\d" => "#1\u0323", + "\\r" => "#1\u030a", + "\\u" => "#1\u0306", + "\\v" => "#1\u030c", + "\\t" => "#1\u0361", # fixme: u+0361 should go after first char in #1 + "\\o" => "\u00f8", + "\\i" => "\u0131", + "\\j" => "\u0237", + + # other backslash escapes + "\\\\" => "\\", + "\\{" => "{", "\\}" => "}", + "\\%" => "%", + + # We parse {....} quoting as an empty directive: + "" => "#1", # many other substitutions can be found in # Base.REPLCompletions.latex_symbols @@ -49,60 +183,59 @@ const latex_unicode = Dict{String,String}( # LaTeX directives converted to Markdown const md_directives = Dict{String,String}( - "emph" => "_#1_", - "textit" => "_#1_", - "it" => "_#1_", - "textbf" => "**#1**", - "bf" => "**#1**", - "texttt" => "`#1`", - "url" => "[#1](#1)", - "sout" => "~~#1~~", - "st" => "~~#1~~", - "cancel" => "~~#1~~", + "\\emph" => "_#1_", + "\\textit" => "_#1_", + "\\it" => "_#1_", + "\\textbf" => "**#1**", + "\\bf" => "**#1**", + "\\texttt" => "`#1`", + "\\url" => "[#1](#1)", + "\\sout" => "~~#1~~", + "\\st" => "~~#1~~", + "\\cancel" => "~~#1~~", ) # directives that are stripped when converting # to text/plain const text_directives = Dict{String,String}( - "emph" => "#1", - "textit" => "#1", - "it" => "#1", - "textbf" => "#1", - "bf" => "#1", - "texttt" => "#1", - "url" => "#1", - "sout" => "#1", - "st" => "#1", - "cancel" => "#1", + "\\emph" => "#1", + "\\textit" => "#1", + "\\it" => "#1", + "\\textbf" => "#1", + "\\bf" => "#1", + "\\texttt" => "#1", + "\\url" => "#1", + "\\sout" => "#1", + "\\st" => "#1", + "\\cancel" => "#1", ) -# Given a string `s` that matches the latex_directive regex, -# return a new string to replace it with. We perform substitutions -# based on extra_directives as well as on latex_unicode, from above. +# Given a (sub)string `s` that represents a LaTeX directive matched +# by search_latexdirective, performs our Unicode substitutions and +# also any additional substitutions given by extra_directives. function directive_substitution(s::AbstractString, extra_directives::Associative{String,String}) - m = match(latex_directive, s) - m === nothing && return s - directive, arg_ = extract_directive(m) - arg = replace_directives(arg_, extra_directives) # recursively replace in args - if haskey(extra_directives, directive) - return replace(extra_directives[directive], "#1", arg) - elseif haskey(latex_unicode, directive) - return replace(latex_unicode[directive], "#1", arg) - else - bdir = string('\\', directive) - if haskey(Base.REPLCompletions.latex_symbols, bdir) - sym = Base.REPLCompletions.latex_symbols[bdir] - if isempty(sym) || strwidth(sym) > 0 - return string(sym, arg) + ds, de = search_latexdirective(s) + ae = endof(s) + directive = SubString(s, ds, de) + for dict in (extra_directives, latex_unicode, Base.REPLCompletions.latex_symbols) + if haskey(dict, directive) + sub = dict[directive] + if contains(sub, "#1") + arg = striparg(replace_directives(striparg(s, de+1, ae), extra_directives)) + return replace(sub, "#1", arg) else - return string(arg, sym) # combining character like \hat{x} + arg = replace_directives(SubString(s, de+1, ae), extra_directives) + if strwidth(sub) == 0 # \hat{...} etc: combining chars go after argument + return string(striparg(arg), sub) + else + return string(sub, arg) # don't strip for 0-arg macros + end end - else - return s # no substitutions found end end + return s # ignore unrecognized directives end # replace all latex directives in `s` via `directive_substitution` replace_directives(s::AbstractString, extra_directives::Associative{String,String}) = - replace(s, latex_directive, sub -> directive_substitution(sub, extra_directives)) + replace(s, LaTeXDirectiveSearch(), sub -> directive_substitution(sub, extra_directives)) diff --git a/test/runtests.jl b/test/runtests.jl index 944fc38..4a19eb8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -50,6 +50,6 @@ end import BibTeX: replace_directives, md_directives @testset "latex" begin - @test replace_directives(raw"foo \emph{bar} {\bf baz} {\^{u}}", md_directives) == - "foo _bar_ **baz** û" + @test replace_directives(raw"foo \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", md_directives) == + "foo α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û" end From 26bfe8d705552d9fe4ba2d3d9ba2a918298e562f Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 3 Aug 2017 12:16:31 -0400 Subject: [PATCH 3/4] rudimentary equation support (sub/superscripts, elimination of dollar signs) --- src/latex.jl | 116 ++++++++++++++++++++++++++++++++++++++++------- test/runtests.jl | 6 +-- 2 files changed, 102 insertions(+), 20 deletions(-) diff --git a/src/latex.jl b/src/latex.jl index 735b186..9dd7f59 100644 --- a/src/latex.jl +++ b/src/latex.jl @@ -14,6 +14,8 @@ const BRACEOPEN = UInt8('{') const BRACECLOSE = UInt8('}') const SPACE = UInt8(' ') const DOLLAR = UInt8('$') +const CARET = UInt8('^') +const UNDERSCORE = UInt8('_') isalpha8(x::UInt8) = UInt8('a') ≤ x ≤ UInt8('z') || UInt8('A') ≤ x ≤ UInt8('Z') isalnum8(x::UInt8) = UInt8('0') ≤ x ≤ UInt8('9') || isalpha8(x) @@ -33,22 +35,28 @@ function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1 allspaces=true # find \foo directive or {...}: + c = UInt8(0) while i ≤ e c = unsafe_load(p, i) - (c == BACKSLASH || c == BRACEOPEN) && break + (c == BACKSLASH || c == BRACEOPEN || c == CARET || c == UNDERSCORE) && break c != SPACE && (allspaces = false) i += 1 end - if i ≤ e && unsafe_load(p, i) == BACKSLASH + if i ≤ e && c != BRACEOPEN directive_start = i - i += 2 - i-1 > e && return 0,0,0 - if isalpha8(unsafe_load(p, i-1)) - while i ≤ e && isalpha8(unsafe_load(p, i)) - i += 1 + if c == BACKSLASH + i += 2 + i-1 > e && return 0,0,0 + if isalpha8(unsafe_load(p, i-1)) + while i ≤ e && isalpha8(unsafe_load(p, i)) + i += 1 + end end + directive_end = i-1 + else + directive_end = directive_start # ^ or _ + i += 1 end - directive_end = i-1 # look for optional opening brace while i ≤ e && unsafe_load(p, i) == SPACE @@ -63,18 +71,26 @@ function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1 j -= 1 end if j < istart || unsafe_load(p, j) != BRACEOPEN - # argument is not in braces … get next token - if allspaces + if unsafe_load(p, i) == BACKSLASH + # argument is another latex directive + ds,de,ae = search_latexdirective(s, i) + return directive_start, directive_end, ae + elseif c != BACKSLASH + # in an equation, token is a single char + return directive_start, directive_end, i + elseif allspaces # if `\directive ...` was preceded only # by whitespace, then assume arguments # extend to the end of the string. This # happens when we recurse on `{\directive ...}`. return directive_start, directive_end, e + else + # argument is not in braces … get next token + while i ≤ e && isalnum8(unsafe_load(p, i)) + i += 1 + end + return directive_start, directive_end, i-1 end - while i ≤ e && isalnum8(unsafe_load(p, i)) - i += 1 - end - return directive_start, directive_end, i-1 end end i += 1 @@ -173,6 +189,7 @@ const latex_unicode = Dict{String,String}( "\\\\" => "\\", "\\{" => "{", "\\}" => "}", "\\%" => "%", + # "\\\$" => "\$" -- dollar signs will be unescaped in strip_dollars # We parse {....} quoting as an empty directive: "" => "#1", @@ -186,9 +203,12 @@ const md_directives = Dict{String,String}( "\\emph" => "_#1_", "\\textit" => "_#1_", "\\it" => "_#1_", + "\\mathit" => "_#1_", "\\textbf" => "**#1**", "\\bf" => "**#1**", + "\\mathbf" => "**#1**", "\\texttt" => "`#1`", + "\\mathrm" => "#1", "\\url" => "[#1](#1)", "\\sout" => "~~#1~~", "\\st" => "~~#1~~", @@ -201,15 +221,43 @@ const text_directives = Dict{String,String}( "\\emph" => "#1", "\\textit" => "#1", "\\it" => "#1", + "\\mathit" => "#1", "\\textbf" => "#1", "\\bf" => "#1", + "\\mathbf" => "#1", "\\texttt" => "#1", + "\\mathrm" => "#1", "\\url" => "#1", - "\\sout" => "#1", - "\\st" => "#1", - "\\cancel" => "#1", ) +# Unicode includes an incomplete set of super/subscript characters: +const superscripts = Dict{Char,Char}( + '0'=>'⁰', '1'=>'¹', '2'=>'²', '3'=>'³', '4'=>'⁴', '5'=>'⁵', '6'=>'⁶', '7'=>'⁷', '8'=>'⁸', '9'=>'⁹', + 'a'=>'ᵃ', 'b'=>'ᵇ', 'c'=>'ᶜ', 'd'=>'ᵈ', 'e'=>'ᵉ', 'f'=>'ᶠ', 'g'=>'ᵍ', 'h'=>'ʰ', + 'i'=>'ⁱ', 'j'=>'ʲ', 'k'=>'ᵏ', 'l'=>'ˡ', 'm'=>'ᵐ', 'n'=>'ⁿ', 'o'=>'ᵒ', 'p'=>'ᵖ', + 'r'=>'ʳ', 's'=>'ˢ', 't'=>'ᵗ', 'u'=>'ᵘ', 'v'=>'ᵛ', 'w'=>'ʷ', 'x'=>'ˣ', 'y'=>'ʸ', 'z'=>'ᶻ', + 'A'=>'ᴬ', 'B'=>'ᴮ', 'C'=>'ᶜ', 'D'=>'ᴰ', 'E'=>'ᴱ', 'G'=>'ᴳ', 'H'=>'ᴴ', 'I'=>'ᴵ', 'J'=>'ᴶ', + 'K'=>'ᴷ', 'L'=>'ᴸ', 'M'=>'ᴹ', 'N'=>'ᴺ', 'O'=>'ᴼ', 'P'=>'ᴾ', 'R'=>'ᴿ', 'S'=>'ˢ', 'T'=>'ᵀ', + 'U'=>'ᵁ', 'V'=>'ⱽ', 'W'=>'ᵂ', 'β'=>'ᵝ', 'γ'=>'ᵞ', 'δ'=>'ᵟ', 'ψ'=>'ᵠ', 'χ'=>'ᵡ', 'Θ'=>'ᶿ', + '+'=>'⁺', '-'=>'⁻', '='=>'⁼', '('=>'⁽', ')'=>'⁾', ' '=>' ', +) +const subscripts = Dict{Char,Char}( + '0'=>'₀', '1'=>'₁', '2'=>'₂', '3'=>'₃', '4'=>'₄', '5'=>'₅', '6'=>'₆', '7'=>'₇', '8'=>'₈', '9'=>'₉', + 'a'=>'ₐ', 'e'=>'ₑ', 'h'=>'ₕ', 'i'=>'ᵢ', 'j'=>'ⱼ', 'k'=>'ₖ', 'l'=>'ₗ', 'm'=>'ₘ', + 'n'=>'ₙ', 'o'=>'ₒ', 'p'=>'ₚ', 'r'=>'ᵣ', 's'=>'ₛ', 't'=>'ₜ', 'u'=>'ᵤ', 'v'=>'ᵥ', 'x'=>'ₓ', + 'β'=>'ᵦ', 'γ'=>'ᵧ', 'ρ'=>'ᵨ', 'ψ'=>'ᵩ', 'χ'=>'ᵪ', + '-'=>'₋', '+'=>'₊', '='=>'₌', '('=>'₍', ')'=>'₎', ' '=>' ', +) +function replacechars(s::AbstractString, charmap::Associative{Char,Char}) + buf = IOBuffer() + for c in s + cm = get(charmap, c, '\0') + cm == '\0' && return "" + print(buf, cm) + end + return String(take!(buf)) +end + # Given a (sub)string `s` that represents a LaTeX directive matched # by search_latexdirective, performs our Unicode substitutions and # also any additional substitutions given by extra_directives. @@ -233,9 +281,43 @@ function directive_substitution(s::AbstractString, extra_directives::Associative end end end + if directive == "^" || directive == "_" # super/subscripts + arg = striparg(replace_directives(striparg(s, de+1, ae), extra_directives)) + sarg = replacechars(arg, directive == "^" ? superscripts : subscripts) + !isempty(sarg) && return sarg + end return s # ignore unrecognized directives end # replace all latex directives in `s` via `directive_substitution` replace_directives(s::AbstractString, extra_directives::Associative{String,String}) = replace(s, LaTeXDirectiveSearch(), sub -> directive_substitution(sub, extra_directives)) + +# strip unescaped $ signs from s +function strip_dollars(s::Union{String,SubString{String}}) + buf = IOBuffer() + p = pointer(s) + for i = 1:sizeof(s) + c = unsafe_load(p, i) + if c == BACKSLASH && i < sizeof(s) && unsafe_load(p, i+1) == DOLLAR + write(buf, DOLLAR) # \$ -> $ + elseif c != DOLLAR + write(buf, c) + end + end + return String(take!(buf)) +end + +""" + simplify_latex(s::AbstractString, extra_directives=BibTeX.text_directives) + +Simplify a LaTeX string `s` into "plain text" if possible, stripping/converting +known LaTeX directives in favor of e.g Unicode. + +`extra_directives` is a dictionary (`String=>String`) that maps LaTeX directives +to replacements. It defaults to `BibTeX.text_directives`, which simply strips +out things like bold and italics. Alternatively, you can pass `BibTeX.md_directives`, +which uses Markdown syntax for such directives. +""" +simplify_latex(s::AbstractString, extra_directives::Associative{String,String}=text_directives) = + strip_dollars(replace_directives(s, extra_directives)) diff --git a/test/runtests.jl b/test/runtests.jl index 4a19eb8..0904837 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -48,8 +48,8 @@ end @test isempty(sizehint!(empty!(x2),10)) end -import BibTeX: replace_directives, md_directives +import BibTeX: simplify_latex, md_directives @testset "latex" begin - @test replace_directives(raw"foo \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", md_directives) == - "foo α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û" + @test simplify_latex(raw"foo \$$x_1x_2^\mathrm{3}$ \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", md_directives) == + "foo \$x₁x₂³ α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û" end From 93ecaf5ccfc507299f204025f71833c31f12eec2 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 3 Aug 2017 15:55:09 -0400 Subject: [PATCH 4/4] just use Vector{UInt8} instead of raw pointer loads, since it's not clear we care about performance here --- src/latex.jl | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/src/latex.jl b/src/latex.jl index 9dd7f59..984d7d8 100644 --- a/src/latex.jl +++ b/src/latex.jl @@ -30,14 +30,14 @@ from the `arg`. function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1) e = sizeof(s) 0 < istart ≤ e || return 0,0,0 - p = pointer(s) + p = Vector{UInt8}(s) i = istart allspaces=true # find \foo directive or {...}: c = UInt8(0) while i ≤ e - c = unsafe_load(p, i) + c = p[i] (c == BACKSLASH || c == BRACEOPEN || c == CARET || c == UNDERSCORE) && break c != SPACE && (allspaces = false) i += 1 @@ -47,8 +47,8 @@ function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1 if c == BACKSLASH i += 2 i-1 > e && return 0,0,0 - if isalpha8(unsafe_load(p, i-1)) - while i ≤ e && isalpha8(unsafe_load(p, i)) + if isalpha8(p[i-1]) + while i ≤ e && isalpha8(p[i]) i += 1 end end @@ -59,19 +59,19 @@ function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1 end # look for optional opening brace - while i ≤ e && unsafe_load(p, i) == SPACE + while i ≤ e && p[i] == SPACE i += 1 end i > e && return directive_start, directive_end, e - inbrace = unsafe_load(p, i) == BRACEOPEN + inbrace = p[i] == BRACEOPEN if !inbrace # search backwards from \foo to look for { \foo ...} j = directive_start - 1 - while j ≥ istart && unsafe_load(p, j) == SPACE + while j ≥ istart && p[j] == SPACE j -= 1 end - if j < istart || unsafe_load(p, j) != BRACEOPEN - if unsafe_load(p, i) == BACKSLASH + if j < istart || p[j] != BRACEOPEN + if p[i] == BACKSLASH # argument is another latex directive ds,de,ae = search_latexdirective(s, i) return directive_start, directive_end, ae @@ -86,7 +86,7 @@ function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1 return directive_start, directive_end, e else # argument is not in braces … get next token - while i ≤ e && isalnum8(unsafe_load(p, i)) + while i ≤ e && isalnum8(p[i]) i += 1 end return directive_start, directive_end, i-1 @@ -106,7 +106,7 @@ function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1 # search for end of argument (closing brace) nbraces = 1 while i ≤ e - c = unsafe_load(p, i) + c = p[i] if c == BRACEOPEN nbraces += 1 elseif c == BRACECLOSE @@ -127,22 +127,23 @@ Return the substring of `s` corresponding to the argument from `argstart:argend` leading/trailing whitespace and braces. """ function striparg(s::Union{String,SubString{String}}, argstart::Int=start(s), argend::Int=endof(s)) + argstart > argend && return SubString(s, 1, 0) e = endof(s) (1 ≤ argstart ≤ e && 1 ≤ argend ≤ e) || throw(BoundsError()) - p = pointer(s) - if unsafe_load(p, argend) == BRACECLOSE + p = Vector{UInt8}(s) + if p[argend] == BRACECLOSE argend -= 1 # omit brace - while argstart ≤ argend && unsafe_load(p, argstart) != BRACEOPEN + while argstart ≤ argend && p[argstart] != BRACEOPEN argstart += 1 end argstart > argend && error("malformed argument") argstart += 1 # omit brace end - while argstart ≤ argend && unsafe_load(p, argend) == SPACE + while argstart ≤ argend && p[argend] == SPACE argend -= 1 end - while argstart ≤ argend && unsafe_load(p, argstart) == SPACE + while argstart ≤ argend && p[argstart] == SPACE argstart += 1 end return SubString(s, argstart, argend) @@ -239,7 +240,7 @@ const superscripts = Dict{Char,Char}( 'A'=>'ᴬ', 'B'=>'ᴮ', 'C'=>'ᶜ', 'D'=>'ᴰ', 'E'=>'ᴱ', 'G'=>'ᴳ', 'H'=>'ᴴ', 'I'=>'ᴵ', 'J'=>'ᴶ', 'K'=>'ᴷ', 'L'=>'ᴸ', 'M'=>'ᴹ', 'N'=>'ᴺ', 'O'=>'ᴼ', 'P'=>'ᴾ', 'R'=>'ᴿ', 'S'=>'ˢ', 'T'=>'ᵀ', 'U'=>'ᵁ', 'V'=>'ⱽ', 'W'=>'ᵂ', 'β'=>'ᵝ', 'γ'=>'ᵞ', 'δ'=>'ᵟ', 'ψ'=>'ᵠ', 'χ'=>'ᵡ', 'Θ'=>'ᶿ', - '+'=>'⁺', '-'=>'⁻', '='=>'⁼', '('=>'⁽', ')'=>'⁾', ' '=>' ', + '+'=>'⁺', '-'=>'⁻', '='=>'⁼', '('=>'⁽', ')'=>'⁾', ' '=>' ', '∘'=>'°', ) const subscripts = Dict{Char,Char}( '0'=>'₀', '1'=>'₁', '2'=>'₂', '3'=>'₃', '4'=>'₄', '5'=>'₅', '6'=>'₆', '7'=>'₇', '8'=>'₈', '9'=>'₉', @@ -296,10 +297,10 @@ replace_directives(s::AbstractString, extra_directives::Associative{String,Strin # strip unescaped $ signs from s function strip_dollars(s::Union{String,SubString{String}}) buf = IOBuffer() - p = pointer(s) + p = Vector{UInt8}(s) for i = 1:sizeof(s) - c = unsafe_load(p, i) - if c == BACKSLASH && i < sizeof(s) && unsafe_load(p, i+1) == DOLLAR + c = p[i] + if c == BACKSLASH && i < sizeof(s) && p[i+1] == DOLLAR write(buf, DOLLAR) # \$ -> $ elseif c != DOLLAR write(buf, c)