Merge pull request #4 from stevengj/bibtype

construct Bib and BibItem types for better I/O
2017-08-01 22:45:51 -04:00 · 2017-08-01 22:45:51 -04:00 · 251f16ce9f
parent b16776190b e6c0702811
commit 251f16ce9f
7 changed files with 291 additions and 175 deletions
--- a/example/examples.bib
+++ b/example/examples.bib
@ -55,12 +55,12 @@
  indextitle   = {Effect of immobilization on catalytic characteristics},
 }
-@article{angenendt,
+@Article{angenendt,
  author       = {Angenendt, Arnold},
  title        = {In Honore Salvatoris~-- Vom Sinn und Unsinn der
                  Patrozinienkunde},
  journaltitle = {Revue d'Histoire Eccl{\'e}siastique},
-  date         = 2002,
+  Date         = 2002,
  volume       = 97,
  pages        = {431--456, 791--823},
  langid       = {german},
--- a/src/BibTeX.jl
+++ b/src/BibTeX.jl
@ -1,173 +1,8 @@
 module BibTeX
 export Bibliography, Citation
-struct Parser{T}
+include("parser.jl")
-    tokens::T
+include("citation.jl")
-    substitutions::Dict{String, String}
+include("bibliography.jl")
    records::Dict{String, Dict{String, String}}
    line::Ref{Int}
 end
 # The element type of a parser is the type of the tokens it holds.
 function Base.eltype(p::Parser)
    return eltype(p.tokens)
 end
 # The "identity" token of a parser is the empty token; it marks end of input.
 function Base.one(p::Parser)
    Token = eltype(p)
    return Token("")
 end
 # Outer constructor that takes the type parameter from `tokens`.
 function Parser(tokens::T, substitutions, records, line) where T
    return Parser{T}(tokens, substitutions, records, line)
 end
 # Tokenize `text` and wrap the tokens in a fresh `Parser` positioned on line 1.
 # Tokens are runs of ordinary characters, newlines (kept so lines can be
 # counted) and the single punctuation characters `"`, `#`, `{`, `}`, `@`, `,`, `=`.
 function parse_text(text)
    tokens = matchall(r"[^\s\n\"#{}@,=]+|\n|\"|#|{|}|@|,|=", text)
    substitutions = Dict{String, String}()
    # Same type as the `records` field, so nothing depends on converting an
    # empty dictionary of a different value type.
    records = Dict{String, Dict{String, String}}()
    return Parser(tokens, substitutions, records, Ref(1))
 end
 # Human-readable position of the parser, used as the tail of error messages.
 location(parser) = string("on line ", parser.line.x)
 # Remove and return the next token that is not a newline, counting every
 # newline that is skipped. Returns the empty token when input is exhausted.
 function next_token_default!(parser)
    while !isempty(parser.tokens)
        candidate = shift!(parser.tokens)
        if candidate != "\n"
            return candidate
        end
        parser.line.x = parser.line.x + 1
    end
    return one(parser)
 end
 # Like `next_token_default!`, but running out of input is an error; `eol`
 # names what was expected and is reported in the message.
 function next_token!(parser, eol = "additional tokens")
    token = next_token_default!(parser)
    token == "" && error("Expected $eol $(location(parser))")
    return token
 end
 # Raise a parse error unless `result` is exactly `expectation`.
 function expect(parser, result, expectation)
    result == expectation && return nothing
    error("Expected $expectation $(location(parser))")
 end
 # Consume the next token and require it to be `expectation`.
 function expect!(parser, expectation)
    token = next_token!(parser, expectation)
    return expect(parser, token, expectation)
 end
 # Read the next token and return it with the updated brace depth:
 # "{" raises the depth by one, "}" lowers it by one, anything else keeps it.
 function token_and_counter!(parser, bracket_counter = 1)
    token = next_token!(parser, "}")
    delta = token == "{" ? 1 : (token == "}" ? -1 : 0)
    return token, bracket_counter + delta
 end
 # Read one value piece -- "quoted", {braced} or a bare token -- and append its
 # tokens to `values`. A bare token that names an `@string` macro is replaced
 # by the macro's text; any other bare token is kept as written.
 function value_piece!(parser, values)
    token = next_token!(parser)
    if token == "\""
        token = next_token!(parser, "\"")
        while token != "\""
            push!(values, token)
            token = next_token!(parser, "\"")
        end
    elseif token == "{"
        # Nested braces are kept verbatim; only the brace that brings the
        # depth back to zero ends the value.
        token, counter = token_and_counter!(parser)
        while counter > 0
            push!(values, token)
            token, counter = token_and_counter!(parser, counter)
        end
    else
        name = String(token)
        # `get` yields the stored replacement (`getkey` would only hand the
        # name back, so macros were never expanded).
        push!(values, get(parser.substitutions, name, name))
    end
    return values
 end
 # Read a value made of one or more pieces joined by `#`. Returns the token
 # that follows the value (the empty token at end of input) and the value text.
 function value_and_next!(parser, values)
    value_piece!(parser, values)
    token = next_token_default!(parser)
    while token == "#"
        value_piece!(parser, values)
        token = next_token_default!(parser)
    end
    return token, join(values, " ")
 end
 # Read a field value. Returns `(token, text)` where `token` is the token that
 # follows the value; errors if the input ends right after the value.
 function value!(parser, values = eltype(parser)[])
    token, text = value_and_next!(parser, values)
    if token == ""
        error("Expected , or } $(location(parser))")
    end
    return token, text
 end
 # Read comma-separated `key = value` pairs into `dict` up to the closing "}".
 function field!(parser, dict)
    token = ","
    while token == ","
        token = next_token!(parser, "a new entry or }")
        if token != "}"
            key = token
            expect!(parser, "=")
            token, dict[key] = value!(parser)
        end
    end
    return expect(parser, token, "}")
 end
 export parse_bibtex
 """
    parse_bibtex(text)
 This is a simple input parser for BibTeX. I had trouble finding a standard
 specification, but I've included several features of real BibTeX. Returns
 a preamble (or an empty string) and a dict of dicts. Each entry's dict maps
 field names to values and stores the entry type under the key `"type"`.
 Names defined with `@string` are replaced by their text when used as bare
 values.
 ```jldoctest
 julia> using BibTeX
 julia> preamble, result = parse_bibtex(""\"
            @preamble{some instructions}
            @comment blah blah
            @string{short = long}
            @a{b,
              c = { {c} c},
              d = "d d",
              e = f # short
            }
            ""\");
 julia> preamble
 "some instructions"
 julia> result["b"]["type"]
 "a"
 julia> result["b"]["c"]
 "{ c } c"
 julia> result["b"]["d"]
 "d d"
 julia> result["b"]["e"]
 "f long"
 julia> parse_bibtex("@book")
 ERROR: Expected { on line 1
 [...]
 julia> parse_bibtex("@book@")
 ERROR: Expected { on line 1
 [...]
 ```
 """
 function parse_bibtex(text)
    parser = parse_text(text)
    token = next_token_default!(parser)
    preamble = ""
    while token != ""
        if token == "@"
            record_type = lowercase(next_token!(parser))
            if record_type == "preamble"
                # Reading the value also reads the token after it. That token
                # may be the "@" of the next entry (or the end of input), so it
                # is handed back to the loop instead of being thrown away.
                token, preamble = value_and_next!(parser, eltype(parser)[])
                continue
            elseif record_type != "comment"
                expect!(parser, "{")
                if record_type == "string"
                    field!(parser, parser.substitutions)
                else
                    id = next_token!(parser)
                    dict = Dict("type" => record_type)
                    expect!(parser, ",")
                    field!(parser, dict)
                    parser.records[id] = dict
                end
            end
        end
        token = next_token_default!(parser)
    end
    return preamble, parser.records
 end
 end
--- a/src/bibliography.jl
+++ b/src/bibliography.jl
@ -0,0 +1,39 @@
 # Dictionary-like container mapping citation keys to `Citation` items.
 struct Bibliography <: Associative{String,Citation}
    # Preamble text; the string constructor stores what `parse_bibtex` returns.
    preamble::String
    # Citation key => bibliography item.
    data::Dict{String,Citation}
 end
 """
    Bibliography(bibtex::String)
    Bibliography(io::IO)
 Given a string (or IO stream) of bibtex-format bibliography data,
 parses the data and returns a `Dict`-like object `b::Bibliography` that
 behaves as a dictionary mapping strings to bibliography items
 [`Citation`](@ref).
 """
 function Bibliography(bibtex::String)
    preamble, records = parse_bibtex(bibtex)
    items = Dict{String,Citation}()
    for (key, fields) in records
        # `Citation!` takes the "__type__" entry out of `fields` and turns it
        # into the citation's type parameter.
        items[key] = Citation!(fields)
    end
    return Bibliography(preamble, items)
 end
 # Read everything from `io` and parse it as bibtex text.
 function Bibliography(io::IO)
    text = readstring(io)
    return Bibliography(text)
 end
 # `open(Bibliography, args...)` opens a stream with `args` and parses it.
 function Base.open(::Type{Bibliography}, args...)
    return open(args...) do io
        Bibliography(io)
    end
 end
 # An empty bibliography (no preamble, no items).
 function Base.similar(b::Bibliography)
    return Bibliography("", Dict{String,Citation}())
 end
 # Container maintenance is forwarded to the underlying dictionary; the
 # bibliography itself is returned.
 function Base.rehash!(b::Bibliography, n=length(b.data))
    Base.rehash!(b.data, n)
    return b
 end
 function Base.sizehint!(b::Bibliography, n)
    sizehint!(b.data, n)
    return b
 end
 function Base.empty!(b::Bibliography)
    empty!(b.data)
    return b
 end
 # Copies the item dictionary (not the items) and keeps the preamble.
 function Base.copy(b::Bibliography)
    return Bibliography(b.preamble, copy(b.data))
 end
 # Keys are stored as `String`, whatever string type is passed in.
 function Base.setindex!(b::Bibliography, v::Citation, k::AbstractString)
    key = String(k)
    b.data[key] = v
    return b
 end
 function Base.get(b::Bibliography, k::AbstractString, default)
    return get(b.data, String(k), default)
 end
 # Iteration and length are those of the underlying dictionary.
 function Base.start(b::Bibliography)
    return start(b.data)
 end
 function Base.done(b::Bibliography, i)
    return done(b.data, i)
 end
 function Base.next(b::Bibliography, i)
    return next(b.data, i)
 end
 function Base.length(b::Bibliography)
    return length(b.data)
 end
 # todo: add specialized Base.show methods for MIME"text/bibtex" etc.
--- a/src/citation.jl
+++ b/src/citation.jl
@ -0,0 +1,42 @@
 """
    Citation{S}(data::Dict{String,String})
 A bibliography item in a bibTeX database, based on a dictionary of
 strings to values.  It is parameterized by a symbol `S` giving the
 type of the item (`:article` etcetera).  A `b::Citation` supports
 `b[key]` access to retrieve the data and in general acts like
 a dictionary from `String` to `String`.
 """
 struct Citation{S} <: Associative{String,String}
    # Field name => field value for this item.
    data::Dict{String,String}
 end
 # Empty item of type `S`: no fields set.
 Citation{S}() where {S} = Citation{S}(Dict{String,String}())
 # Build a `Citation` from a parsed record. The "__type__" entry is removed
 # from `data` (hence the `!`) and becomes the type parameter; `data` itself
 # is then used as the citation's storage.
 function Citation!(data::Dict{String,String})
    kind = pop!(data, "__type__")
    return Citation{Symbol(kind)}(data)
 end
 # An empty citation of the same type `S`.
 function Base.similar(b::Citation{S}) where {S}
    return Citation{S}(Dict{String,String}())
 end
 # Container maintenance is forwarded to the underlying dictionary; the
 # citation itself is returned.
 function Base.rehash!(b::Citation, n=length(b.data))
    Base.rehash!(b.data, n)
    return b
 end
 function Base.sizehint!(b::Citation, n)
    sizehint!(b.data, n)
    return b
 end
 function Base.empty!(b::Citation)
    empty!(b.data)
    return b
 end
 function Base.copy(b::Citation{S}) where {S}
    return Citation{S}(copy(b.data))
 end
 # Lookups convert the key to `String` before consulting the dictionary.
 function Base.get(b::Citation, k::AbstractString, default)
    return get(b.data, String(k), default)
 end
 function Base.getindex(b::Citation, k::AbstractString)
    return getindex(b.data, String(k))
 end
 # Both key and value are stored as `String`.
 function Base.setindex!(b::Citation, v::AbstractString, k::AbstractString)
    key = String(k)
    b.data[key] = String(v)
    return b
 end
 # Iteration and length are those of the underlying dictionary.
 function Base.start(b::Citation)
    return start(b.data)
 end
 function Base.done(b::Citation, i)
    return done(b.data, i)
 end
 function Base.next(b::Citation, i)
    return next(b.data, i)
 end
 function Base.length(b::Citation)
    return length(b.data)
 end
 # Compact one-line summary: the item type and the number of fields, e.g.
 # `Citation{:article}(1 entries)`. Written with `where {S}` like the other
 # parametric methods in this file, instead of the deprecated `f{S}(...)` form.
 function Base.show(io::IO, b::Citation{S}) where {S}
    print(io, "Citation{:$S}(", length(b), " entries)")
 end
 # TODO: add Base.show text/plain and text/markdown for formatted citation
--- a/src/parser.jl
+++ b/src/parser.jl
@ -0,0 +1,168 @@
 # State of a bibtex parse over a collection of tokens.
 struct Parser{T}
    # Tokens still to be read; they are removed from the front while parsing.
    tokens::T
    # `@string` definitions read so far.
    substitutions::Dict{String, String}
    # Parsed entries: citation key => (field name => value).
    records::Dict{String, Dict{String, String}}
    # Current line number. The parser reads and writes `line.x`, a field that
    # only `RefValue` has, so the field is typed concretely rather than as the
    # abstract `Ref{Int}`.
    line::Base.RefValue{Int}
 end
 # The element type of a parser is the type of the tokens it holds.
 Base.eltype(p::Parser) = eltype(p.tokens)
 # The empty token; the token readers return it at end of input.
 Base.one(p::Parser) = eltype(p)("")
 # Outer constructor that takes the type parameter from `tokens` and lets the
 # remaining arguments be converted to the field types.
 Parser(tokens::T, substitutions, records, line) where T =
    Parser{T}(tokens, substitutions, records, line)
 # Tokenize `text` and wrap the tokens in a fresh `Parser` positioned on line 1.
 # Tokens are runs of ordinary characters, newlines (kept so lines can be
 # counted) and the single punctuation characters `"`, `#`, `{`, `}`, `@`, `,`, `=`.
 function parse_text(text)
    tokens = matchall(r"[^\s\n\"#{}@,=]+|\n|\"|#|{|}|@|,|=", text)
    substitutions = Dict{String, String}()
    # Same type as the `records` field, so nothing depends on converting an
    # empty dictionary of a different value type.
    records = Dict{String, Dict{String, String}}()
    return Parser(tokens, substitutions, records, Ref(1))
 end
 # Human-readable position of the parser, used as the tail of error messages.
 location(parser) = string("on line ", parser.line.x)
 # Remove and return the next token that is not a newline, counting every
 # newline that is skipped. Returns the empty token when input is exhausted.
 function next_token_default!(parser)
    while !isempty(parser.tokens)
        candidate = shift!(parser.tokens)
        if candidate != "\n"
            return candidate
        end
        parser.line.x = parser.line.x + 1
    end
    return one(parser)
 end
 # Like `next_token_default!`, but running out of input is an error; `eol`
 # names what was expected and is reported in the message.
 function next_token!(parser, eol = "additional tokens")
    token = next_token_default!(parser)
    token == "" && error("Expected $eol $(location(parser))")
    return token
 end
 # Raise a parse error unless `result` is exactly `expectation`.
 function expect(parser, result, expectation)
    result == expectation && return nothing
    error("Expected $expectation $(location(parser))")
 end
 # Consume the next token and require it to be `expectation`.
 function expect!(parser, expectation)
    token = next_token!(parser, expectation)
    return expect(parser, token, expectation)
 end
 # Read the next token and return it with the updated brace depth:
 # "{" raises the depth by one, "}" lowers it by one, anything else keeps it.
 function token_and_counter!(parser, bracket_counter = 1)
    token = next_token!(parser, "}")
    delta = token == "{" ? 1 : (token == "}" ? -1 : 0)
    return token, bracket_counter + delta
 end
 # Read one value piece -- "quoted", {braced} or a bare token -- and append its
 # tokens to `values`. A bare token that names an `@string` macro is replaced
 # by the macro's text; any other bare token is kept as written.
 function value_piece!(parser, values)
    token = next_token!(parser)
    if token == "\""
        token = next_token!(parser, "\"")
        while token != "\""
            push!(values, token)
            token = next_token!(parser, "\"")
        end
    elseif token == "{"
        # Nested braces are kept verbatim; only the brace that brings the
        # depth back to zero ends the value.
        token, counter = token_and_counter!(parser)
        while counter > 0
            push!(values, token)
            token, counter = token_and_counter!(parser, counter)
        end
    else
        # `get` yields the stored replacement (`getkey` would only hand the
        # name back, so macros were never expanded). The name is lowercased
        # for the lookup because `field!` stores names lowercased.
        push!(values, get(parser.substitutions, lowercase(token), String(token)))
    end
    return values
 end
 # Read a value made of one or more pieces joined by `#`. Returns the token
 # that follows the value (the empty token at end of input) and the value text.
 function value_and_next!(parser, values)
    value_piece!(parser, values)
    token = next_token_default!(parser)
    while token == "#"
        value_piece!(parser, values)
        token = next_token_default!(parser)
    end
    return token, join(values, " ")
 end
 # Read a field value. Returns `(token, text)` where `token` is the token that
 # follows the value; errors if the input ends right after the value.
 function value!(parser, values = eltype(parser)[])
    token, text = value_and_next!(parser, values)
    if token == ""
        error("Expected , or } $(location(parser))")
    end
    return token, text
 end
 # Read comma-separated `key = value` pairs into `dict` up to the closing "}".
 # Keys are stored lowercased.
 function field!(parser, dict)
    token = ","
    while token == ","
        token = next_token!(parser, "a new entry or }")
        if token != "}"
            key = token
            expect!(parser, "=")
            token, dict[lowercase(key)] = value!(parser)
        end
    end
    return expect(parser, token, "}")
 end
 """
    parse_bibtex(text)
 This is a simple input parser for BibTeX. I had trouble finding a standard
 specification, but I've included several features of real BibTeX. Returns
 a preamble (or an empty string) and a dict of dicts. Each entry's dict maps
 lowercased field names to values and stores the entry type under the key
 `"__type__"`. Names defined with `@string` are replaced by their text when
 used as bare values.
 ```jldoctest
 julia> using BibTeX: parse_bibtex
 julia> preamble, result = parse_bibtex(""\"
            @preamble{some instructions}
            @comment blah blah
            @string{short = long}
            @a{b,
              c = { {c} c},
              d = "d d",
              e = f # short
            }
            ""\");
 julia> preamble
 "some instructions"
 julia> result["b"]["__type__"]
 "a"
 julia> result["b"]["c"]
 "{ c } c"
 julia> result["b"]["d"]
 "d d"
 julia> result["b"]["e"]
 "f long"
 julia> parse_bibtex("@book")
 ERROR: Expected { on line 1
 [...]
 julia> parse_bibtex("@book@")
 ERROR: Expected { on line 1
 [...]
 ```
 """
 function parse_bibtex(text)
    parser = parse_text(text)
    token = next_token_default!(parser)
    preamble = ""
    while token != ""
        if token == "@"
            record_type = lowercase(next_token!(parser))
            if record_type == "preamble"
                # Reading the value also reads the token after it. That token
                # may be the "@" of the next entry (or the end of input), so it
                # is handed back to the loop instead of being thrown away.
                token, preamble = value_and_next!(parser, eltype(parser)[])
                continue
            elseif record_type != "comment"
                expect!(parser, "{")
                if record_type == "string"
                    field!(parser, parser.substitutions)
                else
                    id = next_token!(parser)
                    dict = Dict("__type__" => record_type)
                    expect!(parser, ",")
                    field!(parser, dict)
                    parser.records[id] = dict
                end
            end
        end
        token = next_token_default!(parser)
    end
    return preamble, parser.records
 end
--- a/test/benchmark.jl
+++ b/test/benchmark.jl
@ -3,4 +3,4 @@ const file = joinpath((@__FILE__) |> dirname |> dirname, "example", "examples.bi
 using BenchmarkTools
 using BibTeX
-@benchmark parse_bibtex(file)
+@benchmark BibTeX.parse_bibtex(file)
--- a/test/runtests.jl
+++ b/test/runtests.jl
@ -1,4 +1,4 @@
-using BibTeX
+using BibTeX, Base.Test
 import Documenter
 Documenter.makedocs(
@ -13,5 +13,37 @@ Documenter.makedocs(
    authors = "Brandon Taylor"
 )
-# just test if it parses (for now)
+@testset "examples.bib" begin
-joinpath((@__FILE__) |> dirname |> dirname, "example", "examples.bib") |> readstring |> parse_bibtex
+    b = open(Bibliography, joinpath("..", "example", "examples.bib"), "r")
    @test length(b) == 92
    @test (b["angenendt"]::Citation{:article})["date"] == "2002"
 end
# Exercises the dictionary interface of `Bibliography` and `Citation` on a
# two-entry database given inline.
@testset "small bib" begin
    b = Bibliography("""
    @article{foo, bar=baz}
    @book{bar, foobar=1}
    """)
    # "foobar" is a field name, not a citation key, so the lookup falls back
    # to the default; likewise for a field that entry "foo" does not have.
    @test get(b, "foobar", nothing) === nothing
    @test get(b["foo"], "blah", nothing) === nothing
    @test string(b["foo"]) == "Citation{:article}(1 entries)"
    # Container maintenance on the bibliography: rehash, copy, empty, similar.
    Base.rehash!(b)
    b2 = copy(b)
    @test length(b2) == length(b)
    @test isempty(sizehint!(empty!(b2),10))
    @test isempty(similar(b))
    # Insert a hand-built citation and set one of its fields.
    b2["x"] = Citation{:foo}()
    b2["x"]["bar"] = "blah"
    @test length(b2) == length(b2["x"]) == 1
    @test b2["x"]["bar"] == "blah"
    @test get(b2["x"], "foo", nothing) === nothing
    # Iteration yields key => value pairs at both levels.
    @test collect(b2)[1][2] == b2["x"]
    @test collect(b2["x"])[1] == ("bar"=>"blah")
    # The same maintenance operations on a single citation; `copy` keeps the
    # type parameter.
    Base.rehash!(b2["x"])
    x2 = copy(b2["x"])::Citation{:foo}
    @test length(x2) == 1
    @test isempty(similar(x2))
    @test isempty(sizehint!(empty!(x2),10))
 end