cosmetic changes: word variables, avoiding short circuits, less returns

Merge pull request #6 from bramtayl/latex
WIP: latex substitutions
2017-08-06 14:34:42 -04:00 · 2017-08-05 11:52:26 -04:00 · 2017-08-03 15:55:09 -04:00 · 2017-08-03 12:16:31 -04:00 · 2017-08-03 09:40:14 -04:00 · 2017-08-03 09:17:14 -04:00
19 changed files with 2532 additions and 0 deletions
--- a/.codecov
+++ b/.codecov
@ -0,0 +1 @@
+comment: false
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
+*.jl.cov
+*.jl.*.cov
+*.jl.mem
--- a/.travis.yml
+++ b/.travis.yml
@ -0,0 +1,14 @@
+# Documentation: http://docs.travis-ci.com/user/languages/julia/
+language: julia
+os:
+  - linux
+julia:
+  - 0.6
+  - nightly
+notifications:
+  email: false
+after_success:
+# build documentation
+  - julia -e 'cd(Pkg.dir("BibTeX")); Pkg.add("Documenter"); include(joinpath("docs", "make.jl"))'
+# push coverage results to Codecov
+  - julia -e 'cd(Pkg.dir("BibTeX")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
--- a/LICENSE.md
+++ b/LICENSE.md
@ -0,0 +1,41 @@
+The BibTeX.jl package is licensed under the MIT "Expat" License:
+
+
+> Copyright (c) 2017: Brandon Taylor.
+>
+>
+> Permission is hereby granted, free of charge, to any person obtaining a copy
+>
+> of this software and associated documentation files (the "Software"), to deal
+>
+> in the Software without restriction, including without limitation the rights
+>
+> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+>
+> copies of the Software, and to permit persons to whom the Software is
+>
+> furnished to do so, subject to the following conditions:
+>
+>
+>
+> The above copyright notice and this permission notice shall be included in all
+>
+> copies or substantial portions of the Software.
+>
+>
+>
+> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+>
+> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+>
+> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+>
+> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+>
+> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+>
+> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+>
+> SOFTWARE.
+>
+>
--- a/README.md
+++ b/README.md
@ -0,0 +1,20 @@
+# BibTeX
+
+[![travis badge][travis_badge]][travis_url]
+[![codecov badge][codecov_badge]][codecov_url]
+
+## Documentation [here][documenter_latest]
+
+Change documentation link to `documenter_stable` once published!
+
+[travis_badge]: https://travis-ci.org/bramtayl/BibTeX.jl.svg?branch=master
+[travis_url]: https://travis-ci.org/bramtayl/BibTeX.jl
+
+[appveyor_badge]: https://ci.appveyor.com/api/projects/status/github/bramtayl/BibTeX.jl?svg=true&branch=master
+[appveyor_url]: https://ci.appveyor.com/project/bramtayl/bibtex-jl
+
+[codecov_badge]: http://codecov.io/github/bramtayl/BibTeX.jl/coverage.svg?branch=master
+[codecov_url]: http://codecov.io/github/bramtayl/BibTeX.jl?branch=master
+
+[documenter_stable]: https://bramtayl.github.io/BibTeX.jl/stable
+[documenter_latest]: https://bramtayl.github.io/BibTeX.jl/latest
--- a/1
+++ b/1
@ -0,0 +1 @@
+julia 0.6
--- a/appveyor.yml
+++ b/appveyor.yml
@ -0,0 +1,26 @@
+environment:
+  matrix:
+  - JULIAVERSION: "julialang/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
+  - JULIAVERSION: "julialang/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
+  - JULIAVERSION: "julianightlies/bin/winnt/x86/julia-latest-win32.exe"
+  - JULIAVERSION: "julianightlies/bin/winnt/x64/julia-latest-win64.exe"
+branches:
+  only:
+    - master
+    - /release-.*/
+notifications:
+  - provider: Email
+    on_build_success: false
+    on_build_failure: false
+    on_build_status_changed: false
+install:
+  - ps: (new-object net.webclient).DownloadFile(
+        $("http://s3.amazonaws.com/"+$env:JULIAVERSION),
+        "C:\projects\julia-binary.exe")
+  - C:\projects\julia-binary.exe /S /D=C:\projects\julia
+build_script:
+  - IF EXIST .git\shallow (git fetch --unshallow)
+  - C:\projects\julia\bin\julia -e "versioninfo();
+      Pkg.clone(pwd(), \"BibTeX\"); Pkg.build(\"BibTeX\")"
+test_script:
+  - C:\projects\julia\bin\julia -e "Pkg.test(\"BibTeX\")"
--- a/docs/.gitignore
+++ b/docs/.gitignore
@ -0,0 +1,2 @@
+build/
+site/
--- a/docs/make.jl
+++ b/docs/make.jl
@ -0,0 +1,8 @@
+import Documenter
+
+Documenter.deploydocs(
+    repo = "github.com/bramtayl/BibTeX.jl.git",
+    target = "build",
+    deps = nothing,
+    make = nothing
+)
--- a/docs/src/index.md
+++ b/docs/src/index.md
@ -0,0 +1,8 @@
+# BibTeX.jl
+
+```@index
+```
+
+```@autodocs
+Modules = [BibTeX]
+```
--- a/example/examples.bib
+++ b/example/examples.bib
--- a/src/BibTeX.jl
+++ b/src/BibTeX.jl
@ -0,0 +1,9 @@
+# Top-level module of the package; `Bibliography` and `Citation` are its
+# only exported names.
+module BibTeX
+export Bibliography, Citation
+
+# citation.jl must be included before bibliography.jl: the `Bibliography`
+# struct mentions `Citation` in its supertype and field types.
+include("parser.jl")
+include("citation.jl")
+include("bibliography.jl")
+include("latex.jl")
+
+end
--- a/src/bibliography.jl
+++ b/src/bibliography.jl
@ -0,0 +1,39 @@
+struct Bibliography <: Associative{String,Citation}
+    preamble::String
+    data::Dict{String,Citation}
+end
+
+"""
+    Bibliography(bibtex::String)
+    Bibliography(io::IO)
+
+Parse BibTeX-format bibliography data, supplied either as a string or as an
+IO stream, into a `Bibliography`: a `Dict`-like container that maps each
+entry's key (a `String`) to its [`Citation`](@ref).
+"""
+function Bibliography(bibtex::String)
+    preamble, records = parse_bibtex(bibtex)
+    # `Citation!` removes the "__type__" field from each record's dictionary
+    citations = Dict(id => Citation!(fields) for (id, fields) in records)
+    Bibliography(preamble, citations)
+end
+Bibliography(io::IO) = Bibliography(readstring(io))
+Base.open(::Type{Bibliography}, args...) = open(io -> Bibliography(io), args...)
+
+Base.similar(b::Bibliography) = Bibliography("", Dict{String,Citation}())
+Base.rehash!(b::Bibliography, n=length(b.data)) = begin Base.rehash!(b.data, n); b; end
+Base.sizehint!(b::Bibliography, n) = begin sizehint!(b.data, n); b; end
+Base.empty!(b::Bibliography) = begin empty!(b.data); b; end
+Base.copy(b::Bibliography) = Bibliography(b.preamble, copy(b.data))
+
+function Base.setindex!(b::Bibliography, v::Citation, k::AbstractString)
+    b.data[String(k)] = v
+    return b
+end
+Base.get(b::Bibliography, k::AbstractString, default) = get(b.data, String(k), default)
+
+Base.start(b::Bibliography) = start(b.data)
+Base.done(b::Bibliography, i) = done(b.data, i)
+Base.next(b::Bibliography, i) = next(b.data, i)
+Base.length(b::Bibliography) = length(b.data)
+
+# todo: add specialized Base.show methods for MIME"text/bibtex" etc.
--- a/src/citation.jl
+++ b/src/citation.jl
@ -0,0 +1,42 @@
+"""
+    Citation{S}(data::Dict{String,String})
+
+A bibliography item in a bibTeX database, based on a dictionary of
+strings to values.  It is parameterized by a symbol `S` giving the
+type of the item (`:article` etcetera).  A `b::Citation` supports
+`b[key]` access to retrieve the data and in general acts like
+a dictionary from `String` to `String`.
+"""
+struct Citation{S} <: Associative{String,String}
+    data::Dict{String,String}
+end
+Citation{S}() where {S} = Citation{S}(Dict{String,String}())
+
+# Wrap a raw field dictionary in a `Citation`. The "__type__" entry is
+# removed from `data` (hence the `!`) and becomes the type parameter.
+function Citation!(data::Dict{String,String})
+    entry_type = pop!(data, "__type__")
+    Citation{Symbol(entry_type)}(data)
+end
+
+Base.similar(b::Citation{S}) where {S} = Citation{S}(Dict{String,String}())
+Base.rehash!(b::Citation, n=length(b.data)) = begin Base.rehash!(b.data, n); b; end
+Base.sizehint!(b::Citation, n) = begin sizehint!(b.data, n); b; end
+Base.empty!(b::Citation) = begin empty!(b.data); b; end
+Base.copy(b::Citation{S}) where {S} = Citation{S}(copy(b.data))
+
+Base.get(b::Citation, k::AbstractString, default) = get(b.data, String(k), default)
+Base.getindex(b::Citation, k::AbstractString) = getindex(b.data, String(k))
+function Base.setindex!(b::Citation, v::AbstractString, k::AbstractString)
+    b.data[String(k)] = String(v)
+    return b
+end
+
+Base.start(b::Citation) = start(b.data)
+Base.done(b::Citation, i) = done(b.data, i)
+Base.next(b::Citation, i) = next(b.data, i)
+Base.length(b::Citation) = length(b.data)
+
+# Compact one-line display: the entry type and the number of stored fields.
+# Written with `where {S}` like the other parametric methods in this file;
+# the older `f{S}(...)` method syntax is deprecated in Julia 0.6.
+function Base.show(io::IO, b::Citation{S}) where {S}
+    print(io, "Citation{:$S}(", length(b), " entries)")
+end
+
+# TODO: add Base.show text/plain and text/markdown for formatted citation
--- a/src/latex.jl
+++ b/src/latex.jl
@ -0,0 +1,361 @@
+# conversion of LaTeX directives to plain text, markdown, etc.
+#
+# The basic idea is that we search for `\foo{argument}`, `{\foo argument}`,
+# or `{\foo{argument}}`, and look up `foo` in a dictionary of substitutions
+# like `\textit` -> `*#1*` where #1 is where the (first) argument is
+# substituted.  Then we have separate dictionary entries for text/plain,
+# text/markdown, etcetera.
+
+###########################################################################
+# parsing LaTeX directives:
+
+const BACKSLASH = UInt8('\\')
+const BRACE_OPEN = UInt8('{')
+const BRACE_CLOSE = UInt8('}')
+const SPACE = UInt8(' ')
+const DOLLAR = UInt8('$')
+const CARET = UInt8('^')
+const UNDERSCORE = UInt8('_')
+is_letter(x::UInt8) = UInt8('a') ≤ x ≤ UInt8('z') || UInt8('A') ≤ x ≤ UInt8('Z')
+is_alphanumeric(x::UInt8) = UInt8('0') ≤ x ≤ UInt8('9') || is_letter(x)
+
+"""
+    search_latex_directive(astring, start_position = 1)
+
+Search for a LaTeX directive `\\directive{argument}` or similar in `astring`,
+starting at byte index `start_position`. Returns
+`(directive_start, directive_end, argument_end)` such that
+`astring[directive_start:directive_end]` gives `\\directive` and
+`astring[directive_end+1:argument_end]` gives `{argument}`.
+
+A bare `{...}` group is reported with an empty directive
+(`directive_end == directive_start - 1`). `(0, 0, 0)` is returned when
+`start_position` is out of range or no directive is found.
+
+Use [`strip_argument`](@ref) to remove surrounding braces and whitespace
+from the `argument`.
+"""
+function search_latex_directive(astring, start_position = 1)
+    string_length = sizeof(astring)
+    if !(0 < start_position ≤ string_length)
+        0, 0, 0
+    else
+        # all scanning below works on bytes, not characters
+        character_vector = Vector{UInt8}(astring)
+        index = start_position
+        all_spaces = true
+
+        # find \foo directive or {...}:
+        character = UInt8(0)
+        while index ≤ string_length
+            character = character_vector[index]
+            if (character == BACKSLASH || character == BRACE_OPEN || character == CARET || character == UNDERSCORE)
+                break
+            end
+            if character != SPACE
+                all_spaces = false
+            end
+            index += 1
+        end
+        if index ≤ string_length && character != BRACE_OPEN
+            directive_start = index
+            if character == BACKSLASH
+                # step over the backslash and the first byte of the name;
+                # a name that does not start with a letter (e.g. `\%`) is
+                # exactly that one byte long
+                index += 2
+                if index - 1 > string_length
+                    return 0,0,0
+                end
+                if is_letter(character_vector[index - 1])
+                    while index ≤ string_length && is_letter(character_vector[index])
+                        index += 1
+                    end
+                end
+                directive_end = index - 1
+            else
+                directive_end = directive_start # ^ or _
+                index += 1
+            end
+
+            # look for optional opening brace
+            while index ≤ string_length && character_vector[index] == SPACE
+                index += 1
+            end
+            if index > string_length
+                return directive_start, directive_end, string_length
+            end
+            in_braces = character_vector[index] == BRACE_OPEN
+            if !in_braces
+                # search backwards from \foo to look for { \foo ...}
+                backwards_index = directive_start - 1
+                while backwards_index ≥ start_position && character_vector[backwards_index] == SPACE
+                    backwards_index -= 1
+                end
+                if backwards_index < start_position || character_vector[backwards_index] != BRACE_OPEN
+                    if character_vector[index] == BACKSLASH
+                        # argument is another latex directive
+                        inner_start_position, inner_directive_end, inner_argument_end = search_latex_directive(astring, index)
+                        return directive_start, directive_end, inner_argument_end
+                    elseif character != BACKSLASH
+                        # `character` still holds the byte that started the
+                        # directive, so this is the `^` / `_` case:
+                        # in an equation, token is a single char
+                        return directive_start, directive_end, index
+                    elseif all_spaces
+                        # if `\directive ...` was preceded only
+                        # by whitespace, then assume arguments
+                        # extend to the end of the string.  This
+                        # happens when we recurse on `{\directive ...}`.
+                        return directive_start, directive_end, string_length
+                    else
+                        # argument is not in braces … get next token
+                        while index ≤ string_length && is_alphanumeric(character_vector[index])
+                            index += 1
+                        end
+                        return directive_start, directive_end, index - 1
+                    end
+                end
+            end
+            index += 1
+        elseif index > string_length
+            return 0, 0, 0
+        else # { ... }
+            directive_start = index
+            directive_end = index - 1
+            in_braces = true
+            index += 1
+        end
+
+        # search for end of argument (closing brace)
+        number_of_braces = 1
+        while index ≤ string_length
+            character = character_vector[index]
+            if character == BRACE_OPEN
+                number_of_braces += 1
+            elseif character == BRACE_CLOSE
+                number_of_braces -= 1
+                if number_of_braces == 0
+                    # when the argument had no brace of its own, the
+                    # closing brace found here is excluded from the argument
+                    argument_end = if in_braces
+                        index
+                    else
+                        index - 1
+                    end
+                    return directive_start, directive_end, argument_end
+                end
+            end
+            index += 1
+        end
+        # unbalanced braces: the argument runs to the end of the string
+        directive_start, directive_end, string_length
+    end
+end
+
+"""
+    strip_argument(astring, start_position = start(astring), end_position = endof(astring))
+
+Return the part of `astring[start_position:end_position]` that forms the actual
+argument text. If the range ends in a closing brace, that brace and everything
+up to and including the first opening brace are dropped; afterwards leading and
+trailing spaces are trimmed. An empty substring is returned when
+`start_position > end_position`.
+"""
+function strip_argument(astring, start_position = start(astring), end_position = endof(astring))
+    start_position > end_position && return SubString(astring, 1, 0)
+
+    last_index = endof(astring)
+    in_bounds = 1 ≤ start_position ≤ last_index && 1 ≤ end_position ≤ last_index
+    in_bounds || throw(BoundsError())
+
+    bytes = Vector{UInt8}(astring)
+    first_kept, last_kept = start_position, end_position
+    if bytes[last_kept] == BRACE_CLOSE
+        last_kept -= 1 # drop the closing brace
+        # advance to the matching opening brace
+        while first_kept ≤ last_kept && bytes[first_kept] != BRACE_OPEN
+            first_kept += 1
+        end
+        first_kept > last_kept && error("malformed argument")
+        first_kept += 1 # drop the opening brace
+    end
+    # trim spaces, trailing first and then leading
+    while first_kept ≤ last_kept && bytes[last_kept] == SPACE
+        last_kept -= 1
+    end
+    while first_kept ≤ last_kept && bytes[first_kept] == SPACE
+        first_kept += 1
+    end
+    return SubString(astring, first_kept, last_kept)
+end
+
+# `replace` locates matches through `search`, so a marker type with its own
+# `search` method is enough to make `replace` operate on LaTeX directives.
+# `search(s, ::LaTeXDirectiveSearch, index)` returns the range covering the
+# directive and its argument, or the empty range `0:-1` when there is none.
+struct LaTeXDirectiveSearch; end
+function Base.search(s::AbstractString, ::LaTeXDirectiveSearch, index)
+    found = search_latex_directive(s, index)
+    match_start = found[1]
+    match_end = found[3]
+    return match_start < index ? (0:-1) : (match_start:match_end)
+end
+###########################################################################
+
+# Unicode substitutions for LaTeX directives.
+# In a replacement string, `#1` marks where the directive's argument is
+# inserted; replacements without `#1` take no argument.
+const latex_unicode = Dict(
+    # accent escapes like `\"u` for `ü`, from the list at
+    # https://en.wikibooks.org/wiki/LaTeX/Special_Characters
+    # converted to Unicode characters (mostly combining marks)
+    "\\`" => "#1\u0300",
+    "\\'" => "#1\u0301",
+    "\\^" => "#1\u0302",
+    "\\\"" => "#1\u0308",
+    "\\H" => "#1\u030b",
+    "\\~" => "#1\u0303",
+    "\\c" => "#1\u0327",
+    "\\k" => "#1\u0328",
+    "\\l" => "\u0142",
+    "\\=" => "#1\u0304",
+    "\\b" => "#1\u0331",
+    "\\." => "#1\u0307",
+    "\\d" => "#1\u0323",
+    "\\r" => "#1\u030a",
+    "\\u" => "#1\u0306",
+    "\\v" => "#1\u030c",
+    "\\t" => "#1\u0361", # fixme: u+0361 should go after first char in #1
+    "\\o" => "\u00f8",
+    "\\i" => "\u0131",
+    "\\j" => "\u0237",
+
+    # other backslash escapes
+    "\\\\" => "\\",
+    "\\{" => "{", "\\}" => "}",
+    "\\%" => "%",
+    # "\\\$" => "\$" -- dollar signs will be unescaped in strip_dollars
+
+    # We parse {....} quoting as an empty directive:
+    "" => "#1",
+
+    # many other substitutions can be found in
+    # Base.REPLCompletions.latex_symbols
+)
+
+# LaTeX directives converted to Markdown
+const markdown_directives = Dict(
+    "\\emph" => "_#1_",
+    "\\textit" => "_#1_",
+    "\\it" => "_#1_",
+    "\\mathit" => "_#1_",
+    "\\textbf" => "**#1**",
+    "\\bf" => "**#1**",
+    "\\mathbf" => "**#1**",
+    "\\texttt" => "`#1`",
+    "\\mathrm" => "#1",
+    "\\url" => "[#1](#1)",
+    "\\sout" => "~~#1~~",
+    "\\st" => "~~#1~~",
+    "\\cancel" => "~~#1~~",
+)
+
+# directives that are stripped when converting
+# to text/plain
+const text_directives = Dict(
+    "\\emph" => "#1",
+    "\\textit" => "#1",
+    "\\it" => "#1",
+    "\\mathit" => "#1",
+    "\\textbf" => "#1",
+    "\\bf" => "#1",
+    "\\mathbf" => "#1",
+    "\\texttt" => "#1",
+    "\\mathrm" => "#1",
+    "\\url" => "#1",
+)
+
+# Unicode includes an incomplete set of super/subscript characters:
+const superscripts = Dict(
+    '0'=>'⁰', '1'=>'¹', '2'=>'²', '3'=>'³', '4'=>'⁴', '5'=>'⁵', '6'=>'⁶', '7'=>'⁷', '8'=>'⁸', '9'=>'⁹',
+    'a'=>'ᵃ', 'b'=>'ᵇ', 'c'=>'ᶜ', 'd'=>'ᵈ', 'e'=>'ᵉ', 'f'=>'ᶠ', 'g'=>'ᵍ', 'h'=>'ʰ',
+    'i'=>'ⁱ', 'j'=>'ʲ', 'k'=>'ᵏ', 'l'=>'ˡ', 'm'=>'ᵐ', 'n'=>'ⁿ', 'o'=>'ᵒ', 'p'=>'ᵖ',
+    'r'=>'ʳ', 's'=>'ˢ', 't'=>'ᵗ', 'u'=>'ᵘ', 'v'=>'ᵛ', 'w'=>'ʷ', 'x'=>'ˣ', 'y'=>'ʸ', 'z'=>'ᶻ',
+    'A'=>'ᴬ', 'B'=>'ᴮ', 'C'=>'ᶜ', 'D'=>'ᴰ', 'E'=>'ᴱ', 'G'=>'ᴳ', 'H'=>'ᴴ', 'I'=>'ᴵ', 'J'=>'ᴶ',
+    'K'=>'ᴷ', 'L'=>'ᴸ', 'M'=>'ᴹ', 'N'=>'ᴺ', 'O'=>'ᴼ', 'P'=>'ᴾ', 'R'=>'ᴿ', 'S'=>'ˢ', 'T'=>'ᵀ',
+    'U'=>'ᵁ', 'V'=>'ⱽ', 'W'=>'ᵂ', 'β'=>'ᵝ', 'γ'=>'ᵞ', 'δ'=>'ᵟ', 'ψ'=>'ᵠ', 'χ'=>'ᵡ', 'Θ'=>'ᶿ',
+    '+'=>'⁺', '-'=>'⁻', '='=>'⁼', '('=>'⁽', ')'=>'⁾', ' '=>' ', '∘'=>'°',
+)
+const subscripts = Dict(
+    '0'=>'₀', '1'=>'₁', '2'=>'₂', '3'=>'₃', '4'=>'₄', '5'=>'₅', '6'=>'₆', '7'=>'₇', '8'=>'₈', '9'=>'₉',
+    'a'=>'ₐ', 'e'=>'ₑ', 'h'=>'ₕ', 'i'=>'ᵢ', 'j'=>'ⱼ', 'k'=>'ₖ', 'l'=>'ₗ', 'm'=>'ₘ',
+    'n'=>'ₙ', 'o'=>'ₒ', 'p'=>'ₚ', 'r'=>'ᵣ', 's'=>'ₛ', 't'=>'ₜ', 'u'=>'ᵤ', 'v'=>'ᵥ', 'x'=>'ₓ',
+    'β'=>'ᵦ', 'γ'=>'ᵧ', 'ρ'=>'ᵨ', 'ψ'=>'ᵩ', 'χ'=>'ᵪ',
+    '-'=>'₋', '+'=>'₊', '='=>'₌', '('=>'₍', ')'=>'₎', ' '=>' ',
+)
+
+# Map every character of `astring` through `character_map` and return the
+# result as a `String`. If any character has no mapping, the whole
+# conversion is abandoned and "" is returned.
+function replace_characters(astring, character_map)
+    output = IOBuffer()
+    for original in astring
+        # '\0' doubles as the "no mapping available" marker
+        replacement = get(character_map, original, '\0')
+        replacement == '\0' && return ""
+        print(output, replacement)
+    end
+    return String(take!(output))
+end
+
+# Given a (sub)string `astring` that represents a LaTeX directive matched
+# by search_latex_directive, performs our Unicode substitutions and
+# also any additional substitutions given by extra_directives.
+# The directive is looked up in `extra_directives`, then `latex_unicode`,
+# then `Base.REPLCompletions.latex_symbols`; the first table containing it
+# wins. Unrecognized directives are returned unchanged.
+function directive_substitution(astring, extra_directives)
+    start_position, directive_end, argument_end = search_latex_directive(astring)
+    string_length = endof(astring)
+    directive = SubString(astring, start_position, directive_end)
+    for dict in (extra_directives, latex_unicode, Base.REPLCompletions.latex_symbols)
+        if haskey(dict, directive)
+            substitution = dict[directive]
+            if contains(substitution, "#1")
+                # `#1` placeholder: substitute the (recursively simplified) argument
+                argument = strip_argument(replace_directives(strip_argument(astring, directive_end + 1, string_length), extra_directives))
+                return replace(substitution, "#1", argument)
+            else
+                argument = replace_directives(SubString(astring, directive_end+1, string_length), extra_directives)
+                if strwidth(substitution) == 0 # \hat{...} etc: combining chars go after argument
+                    return string(strip_argument(argument), substitution)
+                else
+                    return string(substitution, argument) # don't strip for 0-arg macros
+                end
+            end
+        end
+    end
+    if directive == "^" || directive == "_" # super/subscripts
+        argument = strip_argument(replace_directives(strip_argument(astring, directive_end + 1, string_length), extra_directives))
+        dict = if directive == "^"
+            superscripts
+        else
+            subscripts
+        end
+        # replace_characters returns "" when some character has no
+        # super/subscript form; in that case fall through and keep the input
+        substitution = replace_characters(argument, dict)
+        if !isempty(substitution)
+            return substitution
+        end
+    end
+    astring # ignore unrecognized directives
+end
+
+# replace all latex directives in `astring` via `directive_substitution`;
+# `LaTeXDirectiveSearch()` makes `replace` find matches with our custom search
+replace_directives(astring, extra_directives) =
+    replace(astring, LaTeXDirectiveSearch(), substitution -> directive_substitution(substitution, extra_directives))
+
+# Remove every unescaped `$` from `astring`; an escaped `\$` becomes a plain `$`.
+function strip_dollars(astring)
+    bytes = Vector{UInt8}(astring)
+    byte_count = sizeof(astring)
+    output = IOBuffer()
+    for position = 1:byte_count
+        byte = bytes[position]
+        escapes_dollar = byte == BACKSLASH && position < byte_count && bytes[position + 1] == DOLLAR
+        if escapes_dollar
+            # emit `$` in place of the backslash; the `$` byte itself is
+            # skipped by the branch below on the next iteration
+            write(output, DOLLAR)
+        elseif byte != DOLLAR
+            write(output, byte)
+        end
+    end
+    String(take!(output))
+end
+
+"""
+    simplify_latex(astring, extra_directives)
+
+Simplify a LaTeX string `astring` into "plain text" if possible, stripping/converting
+known LaTeX directives in favor of e.g. Unicode. Directives are replaced first,
+and unescaped dollar signs are removed from the result afterwards.
+
+`extra_directives` is a dictionary (`String=>String`) that maps LaTeX directives
+to replacements.  It defaults to `BibTeX.text_directives`, which simply strips
+out things like bold and italics.  Alternatively, you can pass `BibTeX.markdown_directives`,
+which uses Markdown syntax for such directives.
+"""
+simplify_latex(astring, extra_directives = text_directives) =
+    strip_dollars(replace_directives(astring, extra_directives))
--- a/src/parser.jl
+++ b/src/parser.jl
@ -0,0 +1,218 @@
+# Mutable state shared by the parsing functions in this file.
+mutable struct Parser{T}
+    # tokens still to be read; they are consumed from the front
+    tokens::T
+    # abbreviations collected from `@string{...}` records
+    substitutions::Dict{String, String}
+    # parsed entries: id => (field name => value)
+    records::Dict{String, Dict{String, String}}
+    # current line number, used in error messages
+    line::Int
+    # nesting depth of `{`/`}` inside the value currently being read
+    bracket_counter::Int
+end
+
+Base.eltype(p::Parser) = eltype(p.tokens)
+Base.one(p::Parser) = eltype(p)("")
+
+Parser(tokens::T, substitutions, records, line, bracket_counter) where T =
+    Parser{T}(tokens, substitutions, records, line, bracket_counter)
+
+Parser(tokens) = Parser(tokens, Dict{String, String}(), Dict{String, Dict{String, String}}(), 1, 0)
+
+parse_text(text) = matchall(r"[^\s\"#{}@,=\\]+|\s+|\"|#|{|}|@|,|=|\\", text) |> Parser
+
+location(parser) = "on line $(parser.line)"
+
+Base.isempty(p::Parser) = isempty(p.tokens)
+
+# Remove and return the next token. An exhausted token list yields the empty
+# token, and any all-whitespace token is normalized to a single " ".
+# `parser.line` is advanced by the number of newlines in the consumed token.
+function next_token_default!(parser)
+    isempty(parser.tokens) && return one(parser)
+    token = shift!(parser.tokens)
+    parser.line += count(character -> character == '\n', token)
+    return all(isspace, token) ? eltype(parser)(" ") : token
+end
+
+next_token_with_space!(parser, eol = "additional tokens") = begin
+    result = next_token_default!(parser)
+    if result == ""
+        error("Expected $eol $(location(parser))")
+    else
+        result
+    end
+end
+
+# Return the next token, skipping over one whitespace token if it comes
+# first. Errors (mentioning `eol`) when the input runs out.
+function next_token!(parser, eol = "additional tokens")
+    token = next_token_with_space!(parser, eol)
+    return all(isspace, token) ? next_token_with_space!(parser, eol) : token
+end
+
+expect(parser, result, eol) =
+    if result != eol
+        error("Expected $eol $(location(parser))")
+    end
+
+expect!(parser, eol) =
+    expect(parser, next_token!(parser, eol), eol)
+
+token_and_counter!(parser, eol = "}") = begin
+    token = next_token_with_space!(parser, eol)
+    if token == "{"
+        parser.bracket_counter += 1
+    elseif token == "}"
+        parser.bracket_counter -= 1
+    end
+    if parser.bracket_counter < 0
+        error("} without corresponding { $(location(parser))")
+    else
+        token
+    end
+end
+
+# Parse one field value: a quoted string, a braced group or a bare token,
+# optionally followed by `# more` concatenations. Returns the token that
+# ended the value together with the joined text.
+value!(parser, values = eltype(parser)[]) = begin
+    token = next_token!(parser)
+    if token == "\""
+        # quoted: read up to the closing quote that is not nested in braces
+        token = token_and_counter!(parser, "\"")
+        while !(token == "\"" && parser.bracket_counter == 0)
+            push!(values, token)
+            token = token_and_counter!(parser, "\" or }")
+        end
+    elseif token == "{"
+        # braced: read until the opening brace is balanced again
+        parser.bracket_counter += 1
+        token = token_and_counter!(parser)
+        while parser.bracket_counter > 0
+            push!(values, token)
+            token = token_and_counter!(parser)
+        end
+    else
+        # bare token: expand `@string` abbreviations. `get` returns the stored
+        # value (`getkey` would only hand back the key, leaving abbreviations
+        # unexpanded). `field!` stores names lowercased, so look them up the
+        # same way; unknown tokens are kept verbatim.
+        push!(values, get(parser.substitutions, lowercase(token), String(token) ) )
+    end
+    token = next_token!(parser, ", or }")
+    if token == "#"
+        push!(values, " ")
+        value!(parser, values)
+    else
+        token, join(values)
+    end
+end
+
+field!(parser, dict) = begin
+    token = ","
+    while token == ","
+        token = next_token!(parser, "a new entry or }")
+        if token != "}"
+            key = lowercase(token)
+            if haskey(dict, key)
+                error("Duplicated field $key $(location(parser))")
+            else
+                expect!(parser, "=")
+                token, dict[key] = value!(parser)
+            end
+        end
+    end
+    expect(parser, token, "}")
+end
+
+"""
+    parse_bibtex(text)
+
+This is a simple input parser for BibTeX. I had trouble finding a standard
+specification, but I've included several features of real BibTeX. Returns
+a preamble (or an empty string) and a dict of dicts. Abbreviations defined
+with `@string` are expanded where they are used as bare values.
+
+```jldoctest
+julia> using BibTeX: parse_bibtex
+
+julia> preamble, result = parse_bibtex(""\"
+            @preamble{some instructions}
+            @comment blah blah
+            @string{short = long}
+            @a{b,
+              c = {{c} c},
+              d = "d {"} d",
+              e = f # short
+            }
+            ""\");
+
+julia> preamble
+"some instructions"
+
+julia> result["b"]["__type__"]
+"a"
+
+julia> result["b"]["c"]
+"{c} c"
+
+julia> result["b"]["d"]
+"d {\\"} d"
+
+julia> result["b"]["e"]
+"f long"
+
+julia> parse_bibtex("@book")
+ERROR: Expected { on line 1
+[...]
+
+julia> parse_bibtex("@book@")
+ERROR: Expected { on line 1
+[...]
+```
+
+Repeated fields and keys are not allowed:
+
+```jldoctest
+julia> using BibTeX: parse_bibtex
+
+julia> parse_bibtex(""\"
+            @book{abook,
+                title = A}
+            @book{abook,
+                title = B}
+        ""\")
+ERROR: Duplicated id abook on line 3
+[...]
+
+julia> parse_bibtex(""\"
+            @book{abook,
+                title = A,
+                title = B}
+        ""\")
+ERROR: Duplicated field title on line 3
+[...]
+```
+"""
+parse_bibtex(text) = begin
+    parser = parse_text(text)
+    token = next_token_default!(parser)
+    preamble = ""
+    while token != ""
+        if token == "@"
+            record_type = lowercase(next_token!(parser))
+            if record_type == "preamble"
+                trash, preamble = value!(parser)
+            elseif record_type != "comment"
+                expect!(parser, "{")
+                if record_type == "string"
+                    field!(parser, parser.substitutions)
+                else
+                    id = next_token!(parser)
+                    records = parser.records
+                    if haskey(records, id)
+                        error("Duplicated id $id $(location(parser))")
+                    else
+                        dict = Dict("__type__" => record_type)
+                        expect!(parser, ",")
+                        field!(parser, dict)
+                        records[id] = dict
+                    end
+                end
+            end
+        end
+        token = next_token_default!(parser)
+    end
+    preamble, parser.records
+end
--- a/test/REQUIRE
+++ b/test/REQUIRE
@ -0,0 +1 @@
+Documenter
--- a/test/benchmark.jl
+++ b/test/benchmark.jl
@ -0,0 +1,6 @@
+const file = joinpath((@__FILE__) |> dirname |> dirname, "example", "examples.bib") |> readstring
+
+using BenchmarkTools
+using BibTeX
+
+@benchmark BibTeX.parse_bibtex(file)
--- a/test/runtests.jl
+++ b/test/runtests.jl
@ -0,0 +1,58 @@
+using BibTeX, Base.Test
+
+base_file = dirname(dirname(@__FILE__))
+
+import Documenter
+Documenter.makedocs(
+    modules = [BibTeX],
+    format = :html,
+    sitename = "BibTeX.jl",
+    root = joinpath(base_file, "docs"),
+    pages = Any["Home" => "index.md"],
+    strict = true,
+    linkcheck = true,
+    checkdocs = :exports,
+    authors = "Brandon Taylor"
+)
+
+@testset "examples.bib" begin
+    # note: ".." does not work on windows
+    b = open(Bibliography, joinpath(base_file, "example", "examples.bib"), "r")
+    @test length(b) == 92
+    @test (b["angenendt"]::Citation{:article})["date"] == "2002"
+end
+
+@testset "small bib" begin
+    b = Bibliography("""
+    @article{foo, bar=baz}
+    @book{bar, foobar=1}
+    """)
+    @test get(b, "foobar", nothing) === nothing
+    @test get(b["foo"], "blah", nothing) === nothing
+
+    @test string(b["foo"]) == "Citation{:article}(1 entries)"
+
+    Base.rehash!(b)
+    b2 = copy(b)
+    @test length(b2) == length(b)
+    @test isempty(sizehint!(empty!(b2),10))
+    @test isempty(similar(b))
+    b2["x"] = Citation{:foo}()
+    b2["x"]["bar"] = "blah"
+    @test length(b2) == length(b2["x"]) == 1
+    @test b2["x"]["bar"] == "blah"
+    @test get(b2["x"], "foo", nothing) === nothing
+    @test collect(b2)[1][2] == b2["x"]
+    @test collect(b2["x"])[1] == ("bar"=>"blah")
+    Base.rehash!(b2["x"])
+    x2 = copy(b2["x"])::Citation{:foo}
+    @test length(x2) == 1
+    @test isempty(similar(x2))
+    @test isempty(sizehint!(empty!(x2),10))
+end
+
+import BibTeX: simplify_latex, markdown_directives
+@testset "latex" begin
+    @test simplify_latex(raw"foo \$$x_1x_2^\mathrm{3}$ \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", markdown_directives) ==
+          "foo \$x₁x₂³ α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û"
+end
Author	SHA1	Message	Date
Brandon Taylor	673e30ce1f	cosmetic changes: word variables, avoiding short circuits, less returns	2017-08-06 14:34:42 -04:00
bramtayl	3f6dfb22d1	Merge pull request #6 from bramtayl/latex WIP: latex substitutions	2017-08-05 11:52:26 -04:00
Steven G. Johnson	93ecaf5ccf	just use Vector{UInt8} instead of raw pointer loads, since it's not clear we care about performance here	2017-08-03 15:55:09 -04:00
Steven G. Johnson	26bfe8d705	rudimentary equation support (sub/superscripts, elimination of dollar signs)	2017-08-03 12:16:31 -04:00
Steven G. Johnson	ab60bb59d8	implement custom directive parser	2017-08-03 09:40:14 -04:00
Brandon Taylor	54ecf83c40	fully fledged bracket counter	2017-08-03 09:17:14 -04:00
Brandon Taylor	1a8fad9cb5	Revert "move some tests into doctests" This reverts commit `54da038df9`.	2017-08-02 22:21:58 -04:00
Brandon Taylor	54da038df9	move some tests into doctests	2017-08-02 21:21:28 -04:00
Brandon Taylor	88c6e10e83	avoid inserting extraneous spaces	2017-08-02 20:04:31 -04:00
Brandon Taylor	75d1b6d74c	made duplicated fields and keys error for safety	2017-08-02 17:51:43 -04:00
Steven G. Johnson	9c8c04e950	initial attempt at latex substitutions	2017-08-02 12:44:40 -04:00
bramtayl	251f16ce9f	Merge pull request #4 from stevengj/bibtype construct Bib and BibItem types for better I/O	2017-08-01 22:45:51 -04:00
Steven G. Johnson	e6c0702811	more tests and fixes	2017-08-01 17:04:18 -04:00
Steven G. Johnson	457f4104d4	more tests and fixes, renamed to Bibliography and Citation	2017-08-01 16:51:03 -04:00
Steven G. Johnson	75809edb6b	test case-insensitivity of entry types and field keys	2017-08-01 12:07:50 -04:00
Steven G. Johnson	85e5456187	define Bib and BibItem types for better IO, rather than Dict	2017-08-01 12:03:48 -04:00
bramtayl	b16776190b	Update README.md	2017-07-31 15:06:06 -04:00
Brandon Taylor	6e1e18e89b	added test	2017-07-31 14:33:08 -04:00
Brandon Taylor	1c31cd6795	new files	2017-07-31 14:17:41 -04:00
Brandon Taylor	352997ae86	fix issues	2017-07-31 02:11:57 -04:00
Brandon Taylor	19b3a8804e	fixed some issues	2017-07-31 01:01:32 -04:00
Brandon Taylor	687aea8d99	small fixes	2017-07-30 12:54:45 -04:00
Brandon Taylor	d4d33933d4	added exports	2017-07-30 11:19:01 -04:00
Brandon Taylor	2054013b30	simpler	2017-07-30 11:10:48 -04:00
Brandon Taylor	233386140a	fixed tests	2017-07-30 11:07:41 -04:00
Brandon Taylor	10d0eebc31	added in parser	2017-07-30 11:02:08 -04:00
Brandon Taylor	e8e0983528	Generated files	2017-07-30 10:48:15 -04:00