From 9c8c04e950184ae668fb67c9ea9c29e0e850bcba Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Wed, 2 Aug 2017 12:44:40 -0400
Subject: [PATCH] initial attempt at latex substitutions

---
 src/BibTeX.jl    |   1 +
 src/latex.jl     | 108 +++++++++++++++++++++++++++++++++++++++++++++++
 test/runtests.jl |   6 +++
 3 files changed, 115 insertions(+)
 create mode 100644 src/latex.jl

diff --git a/src/BibTeX.jl b/src/BibTeX.jl
index b17118d..5827123 100644
--- a/src/BibTeX.jl
+++ b/src/BibTeX.jl
@@ -4,5 +4,6 @@ export Bibliography, Citation
 include("parser.jl")
 include("citation.jl")
 include("bibliography.jl")
+include("latex.jl")
 
 end
diff --git a/src/latex.jl b/src/latex.jl
new file mode 100644
index 0000000..9459448
--- /dev/null
+++ b/src/latex.jl
@@ -0,0 +1,108 @@
+# conversion of LaTeX directives to plain text, markdown, etc.
+#
+# The basic idea is that we search for `\foo{arg}`, `{\foo arg}`,
+# or `{\foo{arg}}`, and look up `foo` in a dictionary of substitutions
+# like `textit` -> `*#1*` where #1 is where the (first) argument is
+# substituted. Then we have separate dictionary entries for text/plain,
+# text/markdown, etcetera.
+
+# regex matching (directive,arg)
+const latex_directive = r"\\(\W|[A-Za-z]+) *\{([^}]*)\}|\{ *\\(\W|[A-Za-z]+) *(\{([^}]*)\}|[^}]*)\}|\\(\W|[A-Za-z]+) *(\w*)"
+
+# given a match m to latex_directive, return (directive,arg)
+function extract_directive(m::RegexMatch)
+    m.captures[1] !== nothing && return (m.captures[1], m.captures[2])
+    m.captures[3] !== nothing && return (m.captures[3], m.captures[5] === nothing ? m.captures[4] : m.captures[5])
+    m.captures[6] !== nothing && return (m.captures[6], m.captures[7])
+    throw(ArgumentError("unknown latex_directive match"))
+end
+
+# Unicode substitutions for LaTeX directives
+const latex_unicode = Dict{String,String}(
+    # accent escapes like `\"u` for `ü`, from the list at
+    # https://en.wikibooks.org/wiki/LaTeX/Special_Characters
+    # converted to Unicode characters (mostly combining marks)
+    "`" => "#1\u0300",
+    "'" => "#1\u0301",
+    "^" => "#1\u0302",
+    "\"" => "#1\u0308",
+    "H" => "#1\u030b",
+    "~" => "#1\u0303",
+    "c" => "#1\u0327",
+    "k" => "#1\u0328",
+    "l" => "\u0142#1",
+    "=" => "#1\u0304",
+    "b" => "#1\u0331",
+    "." => "#1\u0307",
+    "d" => "#1\u0323",
+    "r" => "#1\u030a",
+    "u" => "#1\u0306",
+    "v" => "#1\u030c",
+    "t" => "#1\u0361", # fixme: u+0361 should go after first char in #1
+    "o" => "\u00f8#1",
+    "i" => "\u0131#1",
+    "j" => "\u0237#1",
+
+    # many other substitutions can be found in
+    # Base.REPLCompletions.latex_symbols
+)
+
+# LaTeX directives converted to Markdown
+const md_directives = Dict{String,String}(
+    "emph" => "_#1_",
+    "textit" => "_#1_",
+    "it" => "_#1_",
+    "textbf" => "**#1**",
+    "bf" => "**#1**",
+    "texttt" => "`#1`",
+    "url" => "[#1](#1)",
+    "sout" => "~~#1~~",
+    "st" => "~~#1~~",
+    "cancel" => "~~#1~~",
+)
+
+# directives that are stripped when converting
+# to text/plain
+const text_directives = Dict{String,String}(
+    "emph" => "#1",
+    "textit" => "#1",
+    "it" => "#1",
+    "textbf" => "#1",
+    "bf" => "#1",
+    "texttt" => "#1",
+    "url" => "#1",
+    "sout" => "#1",
+    "st" => "#1",
+    "cancel" => "#1",
+)
+
+# Given a string `s` that matches the latex_directive regex,
+# return a new string to replace it with. We perform substitutions
+# based on extra_directives as well as on latex_unicode, from above.
+function directive_substitution(s::AbstractString, extra_directives::Associative{String,String})
+    m = match(latex_directive, s)
+    m === nothing && return s
+    directive, arg_ = extract_directive(m)
+    arg = replace_directives(arg_, extra_directives) # recursively replace in args
+    if haskey(extra_directives, directive)
+        return replace(extra_directives[directive], "#1", arg)
+    elseif haskey(latex_unicode, directive)
+        return replace(latex_unicode[directive], "#1", arg)
+    else
+        bdir = string('\\', directive)
+        if haskey(Base.REPLCompletions.latex_symbols, bdir)
+            sym = Base.REPLCompletions.latex_symbols[bdir]
+            if isempty(sym) || strwidth(sym) > 0
+                return string(sym, arg)
+            else
+                return string(arg, sym) # combining character like \hat{x}
+            end
+        else
+            return s # no substitutions found
+        end
+    end
+end
+
+# replace all latex directives in `s` via `directive_substitution`
+replace_directives(s::AbstractString, extra_directives::Associative{String,String}) =
+    replace(s, latex_directive, sub -> directive_substitution(sub, extra_directives))
diff --git a/test/runtests.jl b/test/runtests.jl
index 0831462..944fc38 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -47,3 +47,9 @@ end
     @test isempty(similar(x2))
     @test isempty(sizehint!(empty!(x2),10))
 end
+
+import BibTeX: replace_directives, md_directives
+@testset "latex" begin
+    @test replace_directives(raw"foo \emph{bar} {\bf baz} {\^{u}}", md_directives) ==
+        "foo _bar_ **baz** û"
+end