diff --git a/src/latex.jl b/src/latex.jl index 984d7d8..c86e349 100644 --- a/src/latex.jl +++ b/src/latex.jl @@ -1,7 +1,7 @@ # conversion of LaTeX directives to plain text, markdown, etc. # -# The basic idea is that we search for `\foo{arg}`, `{\foo arg}`, -# or `{\foo{arg}}`, and look up `foo` in a dictionary of substitutions +# The basic idea is that we search for `\foo{argument}`, `{\foo argument}`, +# or `{\foo{argument}}`, and look up `foo` in a dictionary of substitutions # like `\textit` -> `*#1*` where #1 is where the (first) argument is # substituted. Then we have separate dictionary entries for text/plain, # text/markdown, etcetera. @@ -10,158 +10,185 @@ # parsing LaTeX directives: const BACKSLASH = UInt8('\\') -const BRACEOPEN = UInt8('{') -const BRACECLOSE = UInt8('}') +const BRACE_OPEN = UInt8('{') +const BRACE_CLOSE = UInt8('}') const SPACE = UInt8(' ') const DOLLAR = UInt8('$') const CARET = UInt8('^') const UNDERSCORE = UInt8('_') -isalpha8(x::UInt8) = UInt8('a') ≤ x ≤ UInt8('z') || UInt8('A') ≤ x ≤ UInt8('Z') -isalnum8(x::UInt8) = UInt8('0') ≤ x ≤ UInt8('9') || isalpha8(x) +is_letter(x::UInt8) = UInt8('a') ≤ x ≤ UInt8('z') || UInt8('A') ≤ x ≤ UInt8('Z') +is_alphanumeric(x::UInt8) = UInt8('0') ≤ x ≤ UInt8('9') || is_letter(x) """ - search_latexdirective(string, istart=1, inbrace=false) + search_latex_directive(astring, start_position = 1, inbrace=false) -Search for a LaTeX directive \\directive{arg} or similar in `string`, returning -`(ds, de, ae)` such that `string[ds:de]` gives `\\directive` and `string[de+1:ae]` -gives `{arg}`. Use [`striparg`](@ref) to remove surrounding braces and whitespace -from the `arg`. +Search for a LaTeX directive \\directive{argument} or similar in `string`, returning +`(start_position, directive_end, argument_end)` such that `string[start_position:directive_end]` gives `\\directive` and `string[directive_end+1:argument_end]` +gives `{argument}`. Use [`strip_argument`](@ref) to remove surrounding braces and whitespace +from the `argument`. """ -function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1) - e = sizeof(s) - 0 < istart ≤ e || return 0,0,0 - p = Vector{UInt8}(s) - i = istart - allspaces=true +function search_latex_directive(astring, start_position = 1) + string_length = sizeof(astring) + if !(0 < start_position ≤ string_length) + 0, 0, 0 + else + character_vector = Vector{UInt8}(astring) + index = start_position + all_spaces = true - # find \foo directive or {...}: - c = UInt8(0) - while i ≤ e - c = p[i] - (c == BACKSLASH || c == BRACEOPEN || c == CARET || c == UNDERSCORE) && break - c != SPACE && (allspaces = false) - i += 1 - end - if i ≤ e && c != BRACEOPEN - directive_start = i - if c == BACKSLASH - i += 2 - i-1 > e && return 0,0,0 - if isalpha8(p[i-1]) - while i ≤ e && isalpha8(p[i]) - i += 1 + # find \foo directive or {...}: + character = UInt8(0) + while index ≤ string_length + character = character_vector[index] + if (character == BACKSLASH || character == BRACE_OPEN || character == CARET || character == UNDERSCORE) + break + end + if character != SPACE + all_spaces = false + end + index += 1 + end + if index ≤ string_length && character != BRACE_OPEN + directive_start = index + if character == BACKSLASH + index += 2 + if index - 1 > string_length + return 0,0,0 end - end - directive_end = i-1 - else - directive_end = directive_start # ^ or _ - i += 1 - end - - # look for optional opening brace - while i ≤ e && p[i] == SPACE - i += 1 - end - i > e && return directive_start, directive_end, e - inbrace = p[i] == BRACEOPEN - if !inbrace - # search backwards from \foo to look for { \foo ...} - j = directive_start - 1 - while j ≥ istart && p[j] == SPACE - j -= 1 - end - if j < istart || p[j] != BRACEOPEN - if p[i] == BACKSLASH - # argument is another latex directive - ds,de,ae = search_latexdirective(s, i) - return directive_start, directive_end, ae - elseif c != BACKSLASH - # in an equation, token is a single char - return directive_start, directive_end, i - elseif allspaces - # if `\directive ...` was preceded only - # by whitespace, then assume arguments - # extend to the end of the string. This - # happens when we recurse on `{\directive ...}`. - return directive_start, directive_end, e - else - # argument is not in braces … get next token - while i ≤ e && isalnum8(p[i]) - i += 1 + if is_letter(character_vector[index - 1]) + while index ≤ string_length && is_letter(character_vector[index]) + index += 1 + end + end + directive_end = index - 1 + else + directive_end = directive_start # ^ or _ + index += 1 + end + + # look for optional opening brace + while index ≤ string_length && character_vector[index] == SPACE + index += 1 + end + if index > string_length + return directive_start, directive_end, string_length + end + in_braces = character_vector[index] == BRACE_OPEN + if !in_braces + # search backwards from \foo to look for { \foo ...} + backwards_index = directive_start - 1 + while backwards_index ≥ start_position && character_vector[backwards_index] == SPACE + backwards_index -= 1 + end + if backwards_index < start_position || character_vector[backwards_index] != BRACE_OPEN + if character_vector[index] == BACKSLASH + # argument is another latex directive + inner_start_position, inner_directive_end, inner_argument_end = search_latex_directive(astring, index) + return directive_start, directive_end, inner_argument_end + elseif character != BACKSLASH + # in an equation, token is a single char + return directive_start, directive_end, index + elseif all_spaces + # if `\directive ...` was preceded only + # by whitespace, then assume arguments + # extend to the end of the string. This + # happens when we recurse on `{\directive ...}`. + return directive_start, directive_end, string_length + else + # argument is not in braces … get next token + while index ≤ string_length && is_alphanumeric(character_vector[index]) + index += 1 + end + return directive_start, directive_end, index - 1 end - return directive_start, directive_end, i-1 end end + index += 1 + elseif index > string_length + return 0, 0, 0 + else # { ... } + directive_start = index + directive_end = index - 1 + in_braces = true + index += 1 end - i += 1 - elseif i > e - return 0,0,0 - else # { ... } - directive_start = i - directive_end = i - 1 - inbrace = true - i += 1 - end - # search for end of argument (closing brace) - nbraces = 1 - while i ≤ e - c = p[i] - if c == BRACEOPEN - nbraces += 1 - elseif c == BRACECLOSE - nbraces -= 1 - if nbraces == 0 - return directive_start, directive_end, inbrace ? i : i-1 + # search for end of argument (closing brace) + number_of_braces = 1 + while index ≤ string_length + character = character_vector[index] + if character == BRACE_OPEN + number_of_braces += 1 + elseif character == BRACE_CLOSE + number_of_braces -= 1 + if number_of_braces == 0 + argument_end = if in_braces + index + else + index - 1 + end + return directive_start, directive_end, argument_end + end end + index += 1 end - i += 1 + directive_start, directive_end, string_length end - return directive_start, directive_end, e end """ - striparg(s, argstart=start(s), argend=endof(s)) + strip_argument(astring, start_position = start(astring), end_position = endof(astring)) -Return the substring of `s` corresponding to the argument from `argstart:argend`, stripping +Return the substring of `astring` corresponding to the argument from `start_position:end_position`, stripping leading/trailing whitespace and braces. """ -function striparg(s::Union{String,SubString{String}}, argstart::Int=start(s), argend::Int=endof(s)) - argstart > argend && return SubString(s, 1, 0) - e = endof(s) - (1 ≤ argstart ≤ e && 1 ≤ argend ≤ e) || throw(BoundsError()) - - p = Vector{UInt8}(s) - if p[argend] == BRACECLOSE - argend -= 1 # omit brace - while argstart ≤ argend && p[argstart] != BRACEOPEN - argstart += 1 +function strip_argument(astring, start_position = start(astring), end_position = endof(astring)) + if start_position > end_position + SubString(astring, 1, 0) + else + string_length = endof(astring) + if !(1 ≤ start_position ≤ string_length && 1 ≤ end_position ≤ string_length) + throw(BoundsError()) + else + character_vector = Vector{UInt8}(astring) + if character_vector[end_position] == BRACE_CLOSE + end_position -= 1 # omit brace + while start_position ≤ end_position && character_vector[start_position] != BRACE_OPEN + start_position += 1 + end + if start_position > end_position + error("malformed argument") + end + start_position += 1 # omit brace + end + while start_position ≤ end_position && character_vector[end_position] == SPACE + end_position -= 1 + end + while start_position ≤ end_position && character_vector[start_position] == SPACE + start_position += 1 + end + SubString(astring, start_position, end_position) end - argstart > argend && error("malformed argument") - argstart += 1 # omit brace end - while argstart ≤ argend && p[argend] == SPACE - argend -= 1 - end - while argstart ≤ argend && p[argstart] == SPACE - argstart += 1 - end - return SubString(s, argstart, argend) end # to make replace work for LaTeX directives with our # custom search function, all we need to do is to define -# a LaTeXDirectiveSearch type such that search(s, ::LaTeXDirectiveSearch, i) +# a LaTeXDirectiveSearch type such that search(s, ::LaTeXDirectiveSearch, index) # returns the range of the directive struct LaTeXDirectiveSearch; end -function Base.search(s::AbstractString, ::LaTeXDirectiveSearch, i::Integer) - ds, de, ae = search_latexdirective(s, i) - return ds < i ? (0:-1) : (ds:ae) +function Base.search(s::AbstractString, ::LaTeXDirectiveSearch, index) + start_position, directive_end, argument_end = search_latex_directive(s, index) + if start_position < index + 0:-1 + else + start_position:argument_end + end end ########################################################################### # Unicode substitutions for LaTeX directives -const latex_unicode = Dict{String,String}( +const latex_unicode = Dict( # accent escapes like `\"u` for `ü`, from the list at # https://en.wikibooks.org/wiki/LaTeX/Special_Characters # converted to LaTeX characters (mostly combining marks) @@ -200,7 +227,7 @@ const latex_unicode = Dict{String,String}( ) # LaTeX directives converted to Markdown -const md_directives = Dict{String,String}( +const markdown_directives = Dict( "\\emph" => "_#1_", "\\textit" => "_#1_", "\\it" => "_#1_", @@ -218,7 +245,7 @@ const md_directives = Dict{String,String}( # directives that are stripped when converting # to text/plain -const text_directives = Dict{String,String}( +const text_directives = Dict( "\\emph" => "#1", "\\textit" => "#1", "\\it" => "#1", @@ -232,7 +259,7 @@ const text_directives = Dict{String,String}( ) # Unicode includes an incomplete set of super/subscript characters: -const superscripts = Dict{Char,Char}( +const superscripts = Dict( '0'=>'⁰', '1'=>'¹', '2'=>'²', '3'=>'³', '4'=>'⁴', '5'=>'⁵', '6'=>'⁶', '7'=>'⁷', '8'=>'⁸', '9'=>'⁹', 'a'=>'ᵃ', 'b'=>'ᵇ', 'c'=>'ᶜ', 'd'=>'ᵈ', 'e'=>'ᵉ', 'f'=>'ᶠ', 'g'=>'ᵍ', 'h'=>'ʰ', 'i'=>'ⁱ', 'j'=>'ʲ', 'k'=>'ᵏ', 'l'=>'ˡ', 'm'=>'ᵐ', 'n'=>'ⁿ', 'o'=>'ᵒ', 'p'=>'ᵖ', @@ -242,83 +269,93 @@ const superscripts = Dict{Char,Char}( 'U'=>'ᵁ', 'V'=>'ⱽ', 'W'=>'ᵂ', 'β'=>'ᵝ', 'γ'=>'ᵞ', 'δ'=>'ᵟ', 'ψ'=>'ᵠ', 'χ'=>'ᵡ', 'Θ'=>'ᶿ', '+'=>'⁺', '-'=>'⁻', '='=>'⁼', '('=>'⁽', ')'=>'⁾', ' '=>' ', '∘'=>'°', ) -const subscripts = Dict{Char,Char}( +const subscripts = Dict( '0'=>'₀', '1'=>'₁', '2'=>'₂', '3'=>'₃', '4'=>'₄', '5'=>'₅', '6'=>'₆', '7'=>'₇', '8'=>'₈', '9'=>'₉', 'a'=>'ₐ', 'e'=>'ₑ', 'h'=>'ₕ', 'i'=>'ᵢ', 'j'=>'ⱼ', 'k'=>'ₖ', 'l'=>'ₗ', 'm'=>'ₘ', 'n'=>'ₙ', 'o'=>'ₒ', 'p'=>'ₚ', 'r'=>'ᵣ', 's'=>'ₛ', 't'=>'ₜ', 'u'=>'ᵤ', 'v'=>'ᵥ', 'x'=>'ₓ', 'β'=>'ᵦ', 'γ'=>'ᵧ', 'ρ'=>'ᵨ', 'ψ'=>'ᵩ', 'χ'=>'ᵪ', '-'=>'₋', '+'=>'₊', '='=>'₌', '('=>'₍', ')'=>'₎', ' '=>' ', ) -function replacechars(s::AbstractString, charmap::Associative{Char,Char}) - buf = IOBuffer() - for c in s - cm = get(charmap, c, '\0') - cm == '\0' && return "" - print(buf, cm) + +function replace_characters(astring, character_map) + buffer = IOBuffer() + for character in astring + mapped_character = get(character_map, character, '\0') + if mapped_character == '\0' + return "" + end + print(buffer, mapped_character) end - return String(take!(buf)) + String(take!(buffer)) end # Given a (sub)string `s` that represents a LaTeX directive matched -# by search_latexdirective, performs our Unicode substitutions and +# by search_latex_directive, performs our Unicode substitutions and # also any additional substitutions given by extra_directives. -function directive_substitution(s::AbstractString, extra_directives::Associative{String,String}) - ds, de = search_latexdirective(s) - ae = endof(s) - directive = SubString(s, ds, de) +function directive_substitution(astring, extra_directives) + start_position, directive_end, argument_end = search_latex_directive(astring) + string_length = endof(astring) + directive = SubString(astring, start_position, directive_end) for dict in (extra_directives, latex_unicode, Base.REPLCompletions.latex_symbols) if haskey(dict, directive) - sub = dict[directive] - if contains(sub, "#1") - arg = striparg(replace_directives(striparg(s, de+1, ae), extra_directives)) - return replace(sub, "#1", arg) + substitution = dict[directive] + if contains(substitution, "#1") + argument = strip_argument(replace_directives(strip_argument(astring, directive_end + 1, string_length), extra_directives)) + return replace(substitution, "#1", argument) else - arg = replace_directives(SubString(s, de+1, ae), extra_directives) - if strwidth(sub) == 0 # \hat{...} etc: combining chars go after argument - return string(striparg(arg), sub) + argument = replace_directives(SubString(astring, directive_end+1, string_length), extra_directives) + if strwidth(substitution) == 0 # \hat{...} etc: combining chars go after argument + return string(strip_argument(argument), substitution) else - return string(sub, arg) # don't strip for 0-arg macros + return string(substitution, argument) # don't strip for 0-arg macros end end end end if directive == "^" || directive == "_" # super/subscripts - arg = striparg(replace_directives(striparg(s, de+1, ae), extra_directives)) - sarg = replacechars(arg, directive == "^" ? superscripts : subscripts) - !isempty(sarg) && return sarg + argument = strip_argument(replace_directives(strip_argument(astring, directive_end + 1, string_length), extra_directives)) + dict = if directive == "^" + superscripts + else + subscripts + end + substitution = replace_characters(argument, dict) + if !isempty(substitution) + return substitution + end end - return s # ignore unrecognized directives + astring # ignore unrecognized directives end # replace all latex directives in `s` via `directive_substitution` -replace_directives(s::AbstractString, extra_directives::Associative{String,String}) = - replace(s, LaTeXDirectiveSearch(), sub -> directive_substitution(sub, extra_directives)) +replace_directives(astring, extra_directives) = + replace(astring, LaTeXDirectiveSearch(), substitution -> directive_substitution(substitution, extra_directives)) # strip unescaped $ signs from s -function strip_dollars(s::Union{String,SubString{String}}) - buf = IOBuffer() - p = Vector{UInt8}(s) - for i = 1:sizeof(s) - c = p[i] - if c == BACKSLASH && i < sizeof(s) && p[i+1] == DOLLAR - write(buf, DOLLAR) # \$ -> $ - elseif c != DOLLAR - write(buf, c) +function strip_dollars(astring) + buffer = IOBuffer() + character_vector = Vector{UInt8}(astring) + for index = 1:sizeof(astring) + character = character_vector[index] + if character == BACKSLASH && index < sizeof(astring) && character_vector[index + 1] == DOLLAR + write(buffer, DOLLAR) # \$ -> $ + elseif character != DOLLAR + write(buffer, character) end end - return String(take!(buf)) + return String(take!(buffer)) end """ - simplify_latex(s::AbstractString, extra_directives=BibTeX.text_directives) + simplify_latex(astring, extra_directives) -Simplify a LaTeX string `s` into "plain text" if possible, stripping/converting +Simplify a LaTeX string `astring` into "plain text" if possible, stripping/converting known LaTeX directives in favor of e.g Unicode. `extra_directives` is a dictionary (`String=>String`) that maps LaTeX directives to replacements. It defaults to `BibTeX.text_directives`, which simply strips -out things like bold and italics. Alternatively, you can pass `BibTeX.md_directives`, +out things like bold and italics. Alternatively, you can pass `BibTeX.markdown_directives`, which uses Markdown syntax for such directives. """ -simplify_latex(s::AbstractString, extra_directives::Associative{String,String}=text_directives) = - strip_dollars(replace_directives(s, extra_directives)) +simplify_latex(astring, extra_directives = text_directives) = + strip_dollars(replace_directives(astring, extra_directives)) diff --git a/src/parser.jl b/src/parser.jl index 76e0132..d710f34 100644 --- a/src/parser.jl +++ b/src/parser.jl @@ -18,6 +18,8 @@ parse_text(text) = matchall(r"[^\s\"#{}@,=\\]+|\s+|\"|#|{|}|@|,|=|\\", text) |> location(parser) = "on line $(parser.line)" +Base.isempty(p::Parser) = isempty(p.tokens) + next_token_default!(parser) = if isempty(parser.tokens) one(parser) @@ -49,14 +51,15 @@ next_token!(parser, eol = "additional tokens") = begin end end -expect(parser, result, expectation) = - if result != expectation - error("Expected $expectation $(location(parser))") +expect(parser, result, eol) = + if result != eol + error("Expected $eol $(location(parser))") end -expect!(parser, expectation) = expect(parser, next_token!(parser, expectation), expectation) +expect!(parser, eol) = + expect(parser, next_token!(parser, eol), eol) -token_and_counter!(parser, eol) = begin +token_and_counter!(parser, eol = "}") = begin token = next_token_with_space!(parser, eol) if token == "{" parser.bracket_counter += 1 @@ -80,10 +83,10 @@ value!(parser, values = eltype(parser)[]) = begin end elseif token == "{" parser.bracket_counter += 1 - token = token_and_counter!(parser, "}") + token = token_and_counter!(parser) while parser.bracket_counter > 0 push!(values, token) - token = token_and_counter!(parser, "}") + token = token_and_counter!(parser) end else push!(values, getkey(parser.substitutions, token, String(token) ) ) diff --git a/test/runtests.jl b/test/runtests.jl index 0904837..fc391a3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,11 +1,13 @@ using BibTeX, Base.Test +base_file = dirname(dirname(@__FILE__)) + import Documenter Documenter.makedocs( modules = [BibTeX], format = :html, sitename = "BibTeX.jl", - root = joinpath(dirname(dirname(@__FILE__)), "docs"), + root = joinpath(base_file, "docs"), pages = Any["Home" => "index.md"], strict = true, linkcheck = true, @@ -14,7 +16,8 @@ Documenter.makedocs( ) @testset "examples.bib" begin - b = open(Bibliography, joinpath("..", "example", "examples.bib"), "r") + # note: ".." does not work on windows + b = open(Bibliography, joinpath(base_file, "example", "examples.bib"), "r") @test length(b) == 92 @test (b["angenendt"]::Citation{:article})["date"] == "2002" end @@ -48,8 +51,8 @@ end @test isempty(sizehint!(empty!(x2),10)) end -import BibTeX: simplify_latex, md_directives +import BibTeX: simplify_latex, markdown_directives @testset "latex" begin - @test simplify_latex(raw"foo \$$x_1x_2^\mathrm{3}$ \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", md_directives) == + @test simplify_latex(raw"foo \$$x_1x_2^\mathrm{3}$ \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", markdown_directives) == "foo \$x₁x₂³ α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û" end