Split BibTeX.jl into parser.jl, citation.jl and bibliography.jl, and add the
dictionary-like Bibliography and Citation types.

Review notes
  * src/parser.jl: value! looks @string names up with `get`. `getkey` returns the
    key itself, so a defined macro was never replaced by its value. The lookup is
    lowercased because field! stores lowercased names. parse_text now builds the
    records dict with the element type declared by the Parser struct.
  * src/citation.jl: Base.show uses the `where {S}` form already used in that file.
  * test/benchmark.jl: benchmark parsing the file contents, not the path string.
  * test/runtests.jl: locate examples.bib relative to the test file and add a
    regression test for @string expansion.

NOTE(review): indentation, column alignment and blank context lines in this patch
are reconstructed; confirm against the target tree or apply with
`git apply --ignore-whitespace`. The "index" lines carry the blob ids of the
unreviewed patch and should be regenerated.

diff --git a/example/examples.bib b/example/examples.bib
index 36ac849..b784a7d 100644
--- a/example/examples.bib
+++ b/example/examples.bib
@@ -55,12 +55,12 @@
   indextitle   = {Effect of immobilization on catalytic characteristics},
 }
 
-@article{angenendt,
+@Article{angenendt,
   author       = {Angenendt, Arnold},
   title        = {In Honore Salvatoris~-- Vom Sinn und Unsinn der
                   Patrozinienkunde},
   journaltitle = {Revue d'Histoire Eccl{\'e}siastique},
-  date         = 2002,
+  Date         = 2002,
   volume       = 97,
   pages        = {431--456, 791--823},
   langid       = {german},
diff --git a/src/BibTeX.jl b/src/BibTeX.jl
index 3509e06..b17118d 100644
--- a/src/BibTeX.jl
+++ b/src/BibTeX.jl
@@ -1,173 +1,8 @@
 module BibTeX
+export Bibliography, Citation
 
-struct Parser{T}
-    tokens::T
-    substitutions::Dict{String, String}
-    records::Dict{String, Dict{String, String}}
-    line::Ref{Int}
-end
-
-Base.eltype(p::Parser) = eltype(p.tokens)
-Base.one(p::Parser) = eltype(p)("")
-
-Parser(tokens::T, substitutions, records, line) where T =
-    Parser{T}(tokens, substitutions, records, line)
-
-parse_text(text) = begin
-    tokens = matchall(r"[^\s\n\"#{}@,=]+|\n|\"|#|{|}|@|,|=", text)
-    Parser(tokens, Dict{String, String}(), Dict{String, String}(), Ref(1))
-end
-
-location(parser) = "on line $(parser.line.x)"
-
-next_token_default!(parser) =
-    if isempty(parser.tokens)
-        one(parser)
-    else
-        result = shift!(parser.tokens)
-        if result == "\n"
-            parser.line.x = parser.line.x + 1
-            next_token_default!(parser)
-        else
-            result
-        end
-    end
-
-next_token!(parser, eol = "additional tokens") = begin
-    result = next_token_default!(parser)
-    if result == ""
-        error("Expected $eol $(location(parser))")
-    else
-        result
-    end
-end
-
-expect(parser, result, expectation) =
-    if result != expectation
-        error("Expected $expectation $(location(parser))")
-    end
-
-expect!(parser, expectation) = expect(parser, next_token!(parser, expectation), expectation)
-
-token_and_counter!(parser, bracket_counter = 1) = begin
-    token = next_token!(parser, "}")
-    if token == "{"
-        bracket_counter += 1
-    elseif token == "}"
-        bracket_counter -= 1
-    end
-    token, bracket_counter
-end
-
-value!(parser, values = eltype(parser)[]) = begin
-    token = next_token!(parser)
-    if token == "\""
-        token = next_token!(parser, "\"")
-        while token != "\""
-            push!(values, token)
-            token = next_token!(parser, "\"")
-        end
-    elseif token == "{"
-        token, counter = token_and_counter!(parser)
-        while counter > 0
-            push!(values, token)
-            token, counter = token_and_counter!(parser, counter)
-        end
-    else
-        push!(values, getkey(parser.substitutions, token, String(token) ) )
-    end
-    token = next_token!(parser, ", or }")
-    if token == "#"
-        value!(parser, values)
-    else
-        token, join(values, " ")
-    end
-end
-
-field!(parser, dict) = begin
-    token = ","
-    while token == ","
-        token = next_token!(parser, "a new entry or }")
-        if token != "}"
-            key = token
-            expect!(parser, "=")
-            token, dict[key] = value!(parser)
-        end
-    end
-    expect(parser, token, "}")
-end
-
-export parse_bibtex
-"""
-    parse_bibtex(text)
-
-This is a simple input parser for BibTex. I had trouble finding a standard
-specification, but I've included several features of real BibTex. Returns
-a preamble (or an empty string) and a dict of dicts.
-
-```jldoctest
-julia> using BibTeX
-
-julia> preamble, result = parse_bibtex(""\"
-    @preamble{some instructions}
-    @comment blah blah
-    @string{short = long}
-    @a{b,
-      c = { {c} c},
-      d = "d d",
-      e = f # short
-    }
-    ""\");
-
-julia> preamble
-"some instructions"
-
-julia> result["b"]["type"]
-"a"
-
-julia> result["b"]["c"]
-"{ c } c"
-
-julia> result["b"]["d"]
-"d d"
-
-julia> result["b"]["e"]
-"f short"
-
-julia> parse_bibtex("@book")
-ERROR: Expected { on line 1
-[...]
-
-julia> parse_bibtex("@book@")
-ERROR: Expected { on line 1
-[...]
-```
-"""
-parse_bibtex(text) = begin
-    parser = parse_text(text)
-    token = next_token_default!(parser)
-    preamble = ""
-    while token != ""
-        if token == "@"
-            record_type = lowercase(next_token!(parser))
-            if record_type == "preamble"
-                trash, preamble = value!(parser)
-            elseif record_type != "comment"
-                expect!(parser, "{")
-                if record_type == "string"
-                    field!(parser, parser.substitutions)
-                else
-                    id = next_token!(parser)
-                    dict = Dict("type" => record_type)
-                    expect!(parser, ",")
-                    field!(parser, dict)
-                    parser.records[id] = dict
-                end
-            end
-        end
-        token = next_token_default!(parser)
-    end
-    preamble, parser.records
-end
+include("parser.jl")
+include("citation.jl")
+include("bibliography.jl")
 
 end
diff --git a/src/bibliography.jl b/src/bibliography.jl
new file mode 100644
index 0000000..8d727dc
--- /dev/null
+++ b/src/bibliography.jl
@@ -0,0 +1,39 @@
+struct Bibliography <: Associative{String,Citation}
+    preamble::String
+    data::Dict{String,Citation}
+end
+
+"""
+    Bibliography(bibtex::String)
+    Bibliography(io::IO)
+
+Given a string (or IO stream) of bibtex-format bibliography data,
+parses the data and returns a `Dict`-like object `b::Bibliography` that
+behaves as a dictionary mapping strings to bibliography items
+[`Citation`](@ref).
+"""
+function Bibliography(bibtex::String)
+    preamble, data = parse_bibtex(bibtex)
+    return Bibliography(preamble, Dict(k=>Citation!(v) for (k,v) in data))
+end
+Bibliography(io::IO) = Bibliography(readstring(io))
+Base.open(::Type{Bibliography}, args...) = open(io -> Bibliography(io), args...)
+
+Base.similar(b::Bibliography) = Bibliography("", Dict{String,Citation}())
+Base.rehash!(b::Bibliography, n=length(b.data)) = begin Base.rehash!(b.data, n); b; end
+Base.sizehint!(b::Bibliography, n) = begin sizehint!(b.data, n); b; end
+Base.empty!(b::Bibliography) = begin empty!(b.data); b; end
+Base.copy(b::Bibliography) = Bibliography(b.preamble, copy(b.data))
+
+function Base.setindex!(b::Bibliography, v::Citation, k::AbstractString)
+    b.data[String(k)] = v
+    return b
+end
+Base.get(b::Bibliography, k::AbstractString, default) = get(b.data, String(k), default)
+
+Base.start(b::Bibliography) = start(b.data)
+Base.done(b::Bibliography, i) = done(b.data, i)
+Base.next(b::Bibliography, i) = next(b.data, i)
+Base.length(b::Bibliography) = length(b.data)
+
+# todo: add specialized Base.show methods for MIME"text/bibtex" etc.
diff --git a/src/citation.jl b/src/citation.jl
new file mode 100644
index 0000000..9204eeb
--- /dev/null
+++ b/src/citation.jl
@@ -0,0 +1,42 @@
+"""
+    Citation{S}(data::Dict{String,String})
+
+A bibliography item in a bibTeX database, based on a dictionary of
+strings to values. It is parameterized by a symbol `S` giving the
+type of the item (`:article` etcetera). A `b::Citation` supports
+`b[key]` access to retrieve the data and in general acts like
+a dictionary from `String` to `String`.
+"""
+struct Citation{S} <: Associative{String,String}
+    data::Dict{String,String}
+end
+Citation{S}() where {S} = Citation{S}(Dict{String,String}())
+
+function Citation!(data::Dict{String,String})
+    S = Symbol(pop!(data, "__type__"))
+    return Citation{S}(data)
+end
+
+Base.similar(b::Citation{S}) where {S} = Citation{S}(Dict{String,String}())
+Base.rehash!(b::Citation, n=length(b.data)) = begin Base.rehash!(b.data, n); b; end
+Base.sizehint!(b::Citation, n) = begin sizehint!(b.data, n); b; end
+Base.empty!(b::Citation) = begin empty!(b.data); b; end
+Base.copy(b::Citation{S}) where {S} = Citation{S}(copy(b.data))
+
+Base.get(b::Citation, k::AbstractString, default) = get(b.data, String(k), default)
+Base.getindex(b::Citation, k::AbstractString) = getindex(b.data, String(k))
+function Base.setindex!(b::Citation, v::AbstractString, k::AbstractString)
+    b.data[String(k)] = String(v)
+    return b
+end
+
+Base.start(b::Citation) = start(b.data)
+Base.done(b::Citation, i) = done(b.data, i)
+Base.next(b::Citation, i) = next(b.data, i)
+Base.length(b::Citation) = length(b.data)
+
+function Base.show(io::IO, b::Citation{S}) where {S}
+    print(io, "Citation{:$S}(", length(b), " entries)")
+end
+
+# TODO: add Base.show text/plain and text/markdown for formatted citation
diff --git a/src/parser.jl b/src/parser.jl
new file mode 100644
index 0000000..f1382b3
--- /dev/null
+++ b/src/parser.jl
@@ -0,0 +1,168 @@
+struct Parser{T}
+    tokens::T
+    substitutions::Dict{String, String}
+    records::Dict{String, Dict{String, String}}
+    line::Ref{Int}
+end
+
+Base.eltype(p::Parser) = eltype(p.tokens)
+Base.one(p::Parser) = eltype(p)("")
+
+Parser(tokens::T, substitutions, records, line) where T =
+    Parser{T}(tokens, substitutions, records, line)
+
+parse_text(text) = begin
+    tokens = matchall(r"[^\s\n\"#{}@,=]+|\n|\"|#|{|}|@|,|=", text)
+    Parser(tokens, Dict{String, String}(), Dict{String, Dict{String, String}}(), Ref(1))
+end
+
+location(parser) = "on line $(parser.line.x)"
+
+next_token_default!(parser) =
+    if isempty(parser.tokens)
+        one(parser)
+    else
+        result = shift!(parser.tokens)
+        if result == "\n"
+            parser.line.x = parser.line.x + 1
+            next_token_default!(parser)
+        else
+            result
+        end
+    end
+
+next_token!(parser, eol = "additional tokens") = begin
+    result = next_token_default!(parser)
+    if result == ""
+        error("Expected $eol $(location(parser))")
+    else
+        result
+    end
+end
+
+expect(parser, result, expectation) =
+    if result != expectation
+        error("Expected $expectation $(location(parser))")
+    end
+
+expect!(parser, expectation) = expect(parser, next_token!(parser, expectation), expectation)
+
+token_and_counter!(parser, bracket_counter = 1) = begin
+    token = next_token!(parser, "}")
+    if token == "{"
+        bracket_counter += 1
+    elseif token == "}"
+        bracket_counter -= 1
+    end
+    token, bracket_counter
+end
+
+value!(parser, values = eltype(parser)[]) = begin
+    token = next_token!(parser)
+    if token == "\""
+        token = next_token!(parser, "\"")
+        while token != "\""
+            push!(values, token)
+            token = next_token!(parser, "\"")
+        end
+    elseif token == "{"
+        token, counter = token_and_counter!(parser)
+        while counter > 0
+            push!(values, token)
+            token, counter = token_and_counter!(parser, counter)
+        end
+    else
+        push!(values, get(parser.substitutions, lowercase(token), String(token))) # @string
+    end
+    token = next_token!(parser, ", or }")
+    if token == "#"
+        value!(parser, values)
+    else
+        token, join(values, " ")
+    end
+end
+
+field!(parser, dict) = begin
+    token = ","
+    while token == ","
+        token = next_token!(parser, "a new entry or }")
+        if token != "}"
+            key = token
+            expect!(parser, "=")
+            token, dict[lowercase(key)] = value!(parser)
+        end
+    end
+    expect(parser, token, "}")
+end
+
+"""
+    parse_bibtex(text)
+
+Parse the BibTeX source `text`. Names defined with `@string` are expanded, ignoring
+case, wherever they appear as bare (unquoted, unbraced) values. Return the preamble
+(or an empty string) and a `Dict` mapping each entry key to a `Dict` of its fields.
+
+```jldoctest
+julia> using BibTeX: parse_bibtex
+
+julia> preamble, result = parse_bibtex(""\"
+    @preamble{some instructions}
+    @comment blah blah
+    @string{short = long}
+    @a{b,
+      c = { {c} c},
+      d = "d d",
+      e = f # short
+    }
+    ""\");
+
+julia> preamble
+"some instructions"
+
+julia> result["b"]["__type__"]
+"a"
+
+julia> result["b"]["c"]
+"{ c } c"
+
+julia> result["b"]["d"]
+"d d"
+
+julia> result["b"]["e"]
+"f long"
+
+julia> parse_bibtex("@book")
+ERROR: Expected { on line 1
+[...]
+
+julia> parse_bibtex("@book@")
+ERROR: Expected { on line 1
+[...]
+```
+"""
+parse_bibtex(text) = begin
+    parser = parse_text(text)
+    token = next_token_default!(parser)
+    preamble = ""
+    while token != ""
+        if token == "@"
+            record_type = lowercase(next_token!(parser))
+            if record_type == "preamble"
+                trash, preamble = value!(parser)
+            elseif record_type != "comment"
+                expect!(parser, "{")
+                if record_type == "string"
+                    field!(parser, parser.substitutions)
+                else
+                    id = next_token!(parser)
+                    dict = Dict("__type__" => record_type)
+                    expect!(parser, ",")
+                    field!(parser, dict)
+                    parser.records[id] = dict
+                end
+            end
+        end
+        token = next_token_default!(parser)
+    end
+    preamble, parser.records
+end
diff --git a/test/benchmark.jl b/test/benchmark.jl
index 737dad3..cb868d1 100644
--- a/test/benchmark.jl
+++ b/test/benchmark.jl
@@ -3,4 +3,4 @@ const file = joinpath((@__FILE__) |> dirname |> dirname, "example", "examples.bi
 using BenchmarkTools
 using BibTeX
 
-@benchmark parse_bibtex(file)
+@benchmark BibTeX.parse_bibtex($(readstring(file)))
diff --git a/test/runtests.jl b/test/runtests.jl
index 1990dd9..0831462 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,4 +1,4 @@
-using BibTeX
+using BibTeX, Base.Test
 import Documenter
 
 Documenter.makedocs(
@@ -13,5 +13,38 @@ Documenter.makedocs(
     authors = "Brandon Taylor"
 )
 
-# just test if it parses (for now)
-joinpath((@__FILE__) |> dirname |> dirname, "example", "examples.bib") |> readstring |> parse_bibtex
+@testset "examples.bib" begin
+    b = open(Bibliography, joinpath(dirname(dirname(@__FILE__)), "example", "examples.bib"), "r")
+    @test length(b) == 92
+    @test (b["angenendt"]::Citation{:article})["date"] == "2002"
+end
+
+@testset "small bib" begin
+    b = Bibliography("""
+        @article{foo, bar=baz}
+        @book{bar, foobar=1}
+        """)
+    @test get(b, "foobar", nothing) === nothing
+    @test get(b["foo"], "blah", nothing) === nothing
+
+    @test string(b["foo"]) == "Citation{:article}(1 entries)"
+
+    Base.rehash!(b)
+    b2 = copy(b)
+    @test length(b2) == length(b)
+    @test isempty(sizehint!(empty!(b2),10))
+    @test isempty(similar(b))
+    b2["x"] = Citation{:foo}()
+    b2["x"]["bar"] = "blah"
+    @test length(b2) == length(b2["x"]) == 1
+    @test b2["x"]["bar"] == "blah"
+    @test get(b2["x"], "foo", nothing) === nothing
+    @test collect(b2)[1][2] == b2["x"]
+    @test collect(b2["x"])[1] == ("bar"=>"blah")
+    Base.rehash!(b2["x"])
+    x2 = copy(b2["x"])::Citation{:foo}
+    @test length(x2) == 1
+    @test isempty(similar(x2))
+    @test isempty(sizehint!(empty!(x2),10))
+    @test Bibliography("@string{s = long}\n@a{b, c = s}")["b"]["c"] == "long"
+end