define Bib and BibItem types for better IO, rather than Dict

This commit is contained in:
Steven G. Johnson 2017-08-01 12:03:48 -04:00
parent b16776190b
commit 85e5456187
6 changed files with 259 additions and 173 deletions

View File

@ -1,173 +1,8 @@
module BibTeX
export Bib, BibItem
"""
    Parser{T}

Parsing state shared by the tokenizer helpers.

- `tokens`: the remaining tokens, consumed from the front.
- `substitutions`: `@string` abbreviations, name => replacement text.
- `records`: parsed entries, citation key => dictionary of fields.
- `line`: current line number, boxed so it can be updated in place.
"""
struct Parser{T}
    tokens::T
    substitutions::Dict{String, String}
    records::Dict{String, Dict{String, String}}
    # Concrete `RefValue` instead of the abstract `Ref{Int}`: the helpers read
    # and write `line.x` directly, and a concrete field type keeps that access
    # type-stable.
    line::Base.RefValue{Int}
end

# Token type of the wrapped token collection.
Base.eltype(p::Parser) = eltype(p.tokens)

# The empty token, returned when the input is exhausted.
Base.one(p::Parser) = eltype(p)("")

# Infer `T` from `tokens`; the other arguments are converted to the field types.
Parser(tokens::T, substitutions, records, line) where T =
    Parser{T}(tokens, substitutions, records, line)
# Split `text` into tokens and wrap them in a fresh `Parser` positioned on line 1.
# A token is a run of characters that are neither whitespace nor one of the
# delimiters `" # { } @ , =`, or a single newline, or a single delimiter.
# Whitespace other than newlines matches nothing and is dropped.
function parse_text(text)
    tokens = matchall(r"[^\s\n\"#{}@,=]+|\n|\"|#|{|}|@|,|=", text)
    # `records` maps a citation key to a dictionary of fields, so create it with
    # the value type the `Parser` field declares rather than relying on the
    # conversion of an empty `Dict{String, String}`.
    return Parser(tokens, Dict{String, String}(),
                  Dict{String, Dict{String, String}}(), Ref(1))
end
# Position suffix used in error messages, e.g. "on line 3".
function location(parser)
    current = parser.line.x
    return "on line $current"
end
# Remove and return the next token that is not a newline, bumping the line
# counter once per newline skipped. Returns the empty token `one(parser)` when
# no tokens are left.
function next_token_default!(parser)
    while !isempty(parser.tokens)
        token = shift!(parser.tokens)
        if token != "\n"
            return token
        end
        parser.line.x += 1
    end
    return one(parser)
end
# Like `next_token_default!`, but running out of input is an error. `eol`
# describes what was expected and is used in the error message.
function next_token!(parser, eol = "additional tokens")
    token = next_token_default!(parser)
    if token == ""
        error("Expected $eol $(location(parser))")
    end
    return token
end
# Raise an error naming `expectation` and the current line unless `result`
# equals `expectation`; returns `nothing` on success.
function expect(parser, result, expectation)
    if result == expectation
        return nothing
    end
    error("Expected $expectation $(location(parser))")
end
# Consume the next token and check that it equals `expectation`.
function expect!(parser, expectation)
    token = next_token!(parser, expectation)
    return expect(parser, token, expectation)
end
# Consume one token inside a braced value and return it together with the
# updated nesting depth: `{` raises the depth by one, `}` lowers it by one.
function token_and_counter!(parser, bracket_counter = 1)
    token = next_token!(parser, "}")
    delta = token == "{" ? 1 : (token == "}" ? -1 : 0)
    return token, bracket_counter + delta
end
# Parse one field value and return `(delimiter, text)`.
#
# A value is a `"`-quoted run of tokens, a `{}`-delimited run of tokens (nested
# braces are kept as tokens), or a single bare token; pieces may be chained
# with `#`. The pieces are collected in `values` and joined with single spaces.
# `delimiter` is the first token after the value that is not `#`.
function value!(parser, values = eltype(parser)[])
    token = next_token!(parser)
    if token == "\""
        token = next_token!(parser, "\"")
        while token != "\""
            push!(values, token)
            token = next_token!(parser, "\"")
        end
    elseif token == "{"
        token, counter = token_and_counter!(parser)
        while counter > 0
            push!(values, token)
            token, counter = token_and_counter!(parser, counter)
        end
    else
        # Bare token: expand it when it names an `@string` abbreviation. `get`
        # returns the stored replacement text; `getkey` would return the
        # abbreviation's own name and leave it unexpanded.
        push!(values, get(parser.substitutions, token, String(token)))
    end
    token = next_token!(parser, ", or }")
    if token == "#"
        return value!(parser, values)
    end
    return token, join(values, " ")
end

# Parse comma-separated `key = value` pairs into `dict` up to the closing `}`.
function field!(parser, dict)
    token = ","
    while token == ","
        token = next_token!(parser, "a new entry or }")
        if token != "}"
            key = token
            expect!(parser, "=")
            token, dict[key] = value!(parser)
        end
    end
    return expect(parser, token, "}")
end

export parse_bibtex

"""
    parse_bibtex(text)

Parse `text`, the contents of a BibTeX file, with a simple token-based parser.
It covers several features of real BibTeX rather than a full specification:
`@preamble`, `@comment`, `@string` abbreviations and `#` concatenation.

Returns `(preamble, records)`: the preamble text (or an empty string) and a
dictionary from citation key to a dictionary of fields. The lowercased entry
type is stored under the `"type"` field. Malformed input raises an error that
names the expected token and the line number.

```jldoctest
julia> using BibTeX

julia> preamble, result = parse_bibtex(""\"
       @preamble{some instructions}
       @comment blah blah
       @string{short = long}
       @a{b,
         c = { {c} c},
         d = "d d",
         e = f # short
       }
       ""\");

julia> preamble
"some instructions"

julia> result["b"]["type"]
"a"

julia> result["b"]["c"]
"{ c } c"

julia> result["b"]["d"]
"d d"

julia> result["b"]["e"]
"f long"

julia> parse_bibtex("@book")
ERROR: Expected { on line 1
[...]

julia> parse_bibtex("@book@")
ERROR: Expected { on line 1
[...]
```
"""
function parse_bibtex(text)
    parser = parse_text(text)
    token = next_token_default!(parser)
    preamble = ""
    while token != ""
        if token == "@"
            record_type = lowercase(next_token!(parser))
            if record_type == "preamble"
                trailing, preamble = value!(parser)
                # `value!` reads one token past the value. Put it back so that a
                # record starting right after the preamble keeps its `@`.
                unshift!(parser.tokens, trailing)
            elseif record_type != "comment"
                expect!(parser, "{")
                if record_type == "string"
                    # `@string{name = text}` defines abbreviations.
                    field!(parser, parser.substitutions)
                else
                    id = next_token!(parser)
                    dict = Dict("type" => record_type)
                    expect!(parser, ",")
                    field!(parser, dict)
                    parser.records[id] = dict
                end
            end
        end
        # Tokens outside of a record (including `@comment` text) are skipped.
        token = next_token_default!(parser)
    end
    return preamble, parser.records
end
include("parser.jl")
include("bibitem.jl")
include("bib.jl")
end

39
src/bib.jl Normal file
View File

@ -0,0 +1,39 @@
# Bibliography database: a preamble string plus the bibliography items.
# Subtyping `Associative{String,BibItem}` makes it usable as a dictionary from
# `String` keys to `BibItem` values.
struct Bib <: Associative{String,BibItem}
# Preamble text returned by the parser.
preamble::String
# The items, keyed by string.
data::Dict{String,BibItem}
end
"""
    Bib(bibtex::String)
    Bib(io::IO)

Given a string (or IO stream) of bibtex-format bibliography data,
parse the data and return a `Dict`-like object `b::Bib` that
behaves as a dictionary mapping strings to bibliography items
[`BibItem`](@ref).
"""
function Bib(bibtex::String)
    preamble, data = parse_bibtex(bibtex)
    # Build the dictionary with its final key/value types up front. An untyped
    # `Dict(generator)` infers its types from the contents (and is
    # `Dict{Any,Any}` for an empty bibliography), which then has to be converted.
    items = Dict{String,BibItem}(k => BibItem!(v) for (k, v) in data)
    return Bib(preamble, items)
end
# Read the whole stream into a string and parse it.
function Bib(io::IO)
    text = readstring(io)
    return Bib(text)
end

# `open(Bib, args...)`: open a stream as `open(args...)` would and parse it into
# a `Bib`.
function Base.open(::Type{Bib}, args...)
    return open(args...) do io
        Bib(io)
    end
end
# An empty `Bib` with an empty preamble.
Base.similar(b::Bib) = Bib("", Dict{String,BibItem}())

# The following forward to the wrapped dictionary and return `b` itself.
function Base.rehash!(b::Bib, n=length(b.data))
    # `rehash!` is not exported from `Base`, so the call must be qualified;
    # the bare name is not in scope inside this module.
    Base.rehash!(b.data, n)
    return b
end

function Base.sizehint!(b::Bib, n)
    sizehint!(b.data, n)
    return b
end

function Base.empty!(b::Bib)
    empty!(b.data)
    return b
end

# Copy of `b` with a copied dictionary (the items themselves are shared).
Base.copy(b::Bib) = Bib(b.preamble, copy(b.data))
# Store item `v` under key `k` (converted to `String`) and return `b`.
function Base.setindex!(b::Bib, v::BibItem, k::AbstractString)
    # Assign into the wrapped dictionary. The previous form looked up the
    # existing item for `k` (a `KeyError` for new keys) and then called
    # `setindex!` on that item without a key, so nothing was ever stored.
    b.data[String(k)] = v
    return b
end
# Look up `k` (converted to `String`), returning `default` when it is absent.
function Base.get(b::Bib, k::AbstractString, default)
    key = String(k)
    return get(b.data, key, default)
end

# Iteration (`start`/`done`/`next`) and `length` defer to the wrapped dictionary.
function Base.start(b::Bib)
    return start(b.data)
end

function Base.done(b::Bib, i)
    return done(b.data, i)
end

function Base.next(b::Bib, i)
    return next(b.data, i)
end

function Base.length(b::Bib)
    return length(b.data)
end
# todo: add specialized Base.show methods for MIME"text/bibtex" etc.

41
src/bibitem.jl Normal file
View File

@ -0,0 +1,41 @@
"""
    BibItem{S}(data::Dict{String,String})

A bibliography item in a bibTeX database, backed by a dictionary from
`String` keys to `String` values. It is parameterized by a symbol `S` giving
the type of the item (`:article` etcetera). A `b::BibItem` supports `b[key]`
access to retrieve the data and in general acts like a dictionary from
`String` to `String`.
"""
struct BibItem{S} <: Associative{String,String}
# The item's fields; stored as given, not copied.
data::Dict{String,String}
end
# Build a `BibItem` from a raw field dictionary. The "__type__" entry is removed
# from `data` (hence the `!`) and becomes the type parameter; the remaining
# entries are kept in `data`, which the item wraps without copying. `pop!`
# throws a `KeyError` when `data` has no "__type__" entry.
function BibItem!(data::Dict{String,String})
    kind = pop!(data, "__type__")
    return BibItem{Symbol(kind)}(data)
end
# An empty item of the same type `S`.
Base.similar(b::BibItem{S}) where {S} = BibItem{S}(Dict{String,String}())

# The following forward to the wrapped dictionary and return `b` itself.
function Base.rehash!(b::BibItem, n=length(b.data))
    # `rehash!` is not exported from `Base`, so the call must be qualified;
    # the bare name is not in scope inside this module.
    Base.rehash!(b.data, n)
    return b
end

function Base.sizehint!(b::BibItem, n)
    sizehint!(b.data, n)
    return b
end

function Base.empty!(b::BibItem)
    empty!(b.data)
    return b
end

# Copy of `b` of the same type `S`, with a copied dictionary.
Base.copy(b::BibItem{S}) where {S} = BibItem{S}(copy(b.data))
# Dictionary access; keys and stored values are converted to `String`.
function Base.get(b::BibItem, k::AbstractString, default)
    key = String(k)
    return get(b.data, key, default)
end

function Base.getindex(b::BibItem, k::AbstractString)
    key = String(k)
    return b.data[key]
end

function Base.setindex!(b::BibItem, v::AbstractString, k::AbstractString)
    setindex!(b.data, String(v), String(k))
    return b
end

# Iteration (`start`/`done`/`next`) and `length` defer to the wrapped dictionary.
function Base.start(b::BibItem)
    return start(b.data)
end

function Base.done(b::BibItem, i)
    return done(b.data, i)
end

function Base.next(b::BibItem, i)
    return next(b.data, i)
end

function Base.length(b::BibItem)
    return length(b.data)
end
# Compact display: the item type and the number of fields, e.g.
# `BibItem{:article}(5 entries)`.
# Uses `where {S}` like the other methods in this file; the `f{S}(...)` method
# syntax is deprecated.
function Base.show(io::IO, b::BibItem{S}) where {S}
    print(io, "BibItem{:$S}(", length(b), " entries)")
end
# TODO: add Base.show text/plain and text/markdown for formatted citation

168
src/parser.jl Normal file
View File

@ -0,0 +1,168 @@
"""
    Parser{T}

Parsing state shared by the tokenizer helpers.

- `tokens`: the remaining tokens, consumed from the front.
- `substitutions`: `@string` abbreviations, name => replacement text.
- `records`: parsed entries, citation key => dictionary of fields.
- `line`: current line number, boxed so it can be updated in place.
"""
struct Parser{T}
    tokens::T
    substitutions::Dict{String, String}
    records::Dict{String, Dict{String, String}}
    # Concrete `RefValue` instead of the abstract `Ref{Int}`: the helpers read
    # and write `line.x` directly, and a concrete field type keeps that access
    # type-stable.
    line::Base.RefValue{Int}
end

# Token type of the wrapped token collection.
Base.eltype(p::Parser) = eltype(p.tokens)

# The empty token, returned when the input is exhausted.
Base.one(p::Parser) = eltype(p)("")

# Infer `T` from `tokens`; the other arguments are converted to the field types.
Parser(tokens::T, substitutions, records, line) where T =
    Parser{T}(tokens, substitutions, records, line)
# Split `text` into tokens and wrap them in a fresh `Parser` positioned on line 1.
# A token is a run of characters that are neither whitespace nor one of the
# delimiters `" # { } @ , =`, or a single newline, or a single delimiter.
# Whitespace other than newlines matches nothing and is dropped.
function parse_text(text)
    tokens = matchall(r"[^\s\n\"#{}@,=]+|\n|\"|#|{|}|@|,|=", text)
    # `records` maps a citation key to a dictionary of fields, so create it with
    # the value type the `Parser` field declares rather than relying on the
    # conversion of an empty `Dict{String, String}`.
    return Parser(tokens, Dict{String, String}(),
                  Dict{String, Dict{String, String}}(), Ref(1))
end
# Position suffix used in error messages, e.g. "on line 3".
function location(parser)
    current = parser.line.x
    return "on line $current"
end
# Remove and return the next token that is not a newline, bumping the line
# counter once per newline skipped. Returns the empty token `one(parser)` when
# no tokens are left.
function next_token_default!(parser)
    while !isempty(parser.tokens)
        token = shift!(parser.tokens)
        if token != "\n"
            return token
        end
        parser.line.x += 1
    end
    return one(parser)
end
# Like `next_token_default!`, but running out of input is an error. `eol`
# describes what was expected and is used in the error message.
function next_token!(parser, eol = "additional tokens")
    token = next_token_default!(parser)
    if token == ""
        error("Expected $eol $(location(parser))")
    end
    return token
end
# Raise an error naming `expectation` and the current line unless `result`
# equals `expectation`; returns `nothing` on success.
function expect(parser, result, expectation)
    if result == expectation
        return nothing
    end
    error("Expected $expectation $(location(parser))")
end
# Consume the next token and check that it equals `expectation`.
function expect!(parser, expectation)
    token = next_token!(parser, expectation)
    return expect(parser, token, expectation)
end
# Consume one token inside a braced value and return it together with the
# updated nesting depth: `{` raises the depth by one, `}` lowers it by one.
function token_and_counter!(parser, bracket_counter = 1)
    token = next_token!(parser, "}")
    delta = token == "{" ? 1 : (token == "}" ? -1 : 0)
    return token, bracket_counter + delta
end
# Parse one field value and return `(delimiter, text)`.
#
# A value is a `"`-quoted run of tokens, a `{}`-delimited run of tokens (nested
# braces are kept as tokens), or a single bare token; pieces may be chained
# with `#`. The pieces are collected in `values` and joined with single spaces.
# `delimiter` is the first token after the value that is not `#`.
function value!(parser, values = eltype(parser)[])
    token = next_token!(parser)
    if token == "\""
        token = next_token!(parser, "\"")
        while token != "\""
            push!(values, token)
            token = next_token!(parser, "\"")
        end
    elseif token == "{"
        token, counter = token_and_counter!(parser)
        while counter > 0
            push!(values, token)
            token, counter = token_and_counter!(parser, counter)
        end
    else
        # Bare token: expand it when it names an `@string` abbreviation. `get`
        # returns the stored replacement text; `getkey` would return the
        # abbreviation's own name and leave it unexpanded.
        push!(values, get(parser.substitutions, token, String(token)))
    end
    token = next_token!(parser, ", or }")
    if token == "#"
        return value!(parser, values)
    end
    return token, join(values, " ")
end

# Parse comma-separated `key = value` pairs into `dict` up to the closing `}`.
function field!(parser, dict)
    token = ","
    while token == ","
        token = next_token!(parser, "a new entry or }")
        if token != "}"
            key = token
            expect!(parser, "=")
            token, dict[key] = value!(parser)
        end
    end
    return expect(parser, token, "}")
end

"""
    parse_bibtex(text)

Parse `text`, the contents of a BibTeX file, with a simple token-based parser.
It covers several features of real BibTeX rather than a full specification:
`@preamble`, `@comment`, `@string` abbreviations and `#` concatenation.

Returns `(preamble, records)`: the preamble text (or an empty string) and a
dictionary from citation key to a dictionary of fields. The lowercased entry
type is stored under the `"__type__"` field. Malformed input raises an error
that names the expected token and the line number.

```jldoctest
julia> using BibTeX: parse_bibtex

julia> preamble, result = parse_bibtex(""\"
       @preamble{some instructions}
       @comment blah blah
       @string{short = long}
       @a{b,
         c = { {c} c},
         d = "d d",
         e = f # short
       }
       ""\");

julia> preamble
"some instructions"

julia> result["b"]["__type__"]
"a"

julia> result["b"]["c"]
"{ c } c"

julia> result["b"]["d"]
"d d"

julia> result["b"]["e"]
"f long"

julia> parse_bibtex("@book")
ERROR: Expected { on line 1
[...]

julia> parse_bibtex("@book@")
ERROR: Expected { on line 1
[...]
```
"""
function parse_bibtex(text)
    parser = parse_text(text)
    token = next_token_default!(parser)
    preamble = ""
    while token != ""
        if token == "@"
            record_type = lowercase(next_token!(parser))
            if record_type == "preamble"
                trailing, preamble = value!(parser)
                # `value!` reads one token past the value. Put it back so that a
                # record starting right after the preamble keeps its `@`.
                unshift!(parser.tokens, trailing)
            elseif record_type != "comment"
                expect!(parser, "{")
                if record_type == "string"
                    # `@string{name = text}` defines abbreviations.
                    field!(parser, parser.substitutions)
                else
                    id = next_token!(parser)
                    dict = Dict("__type__" => record_type)
                    expect!(parser, ",")
                    field!(parser, dict)
                    parser.records[id] = dict
                end
            end
        end
        # Tokens outside of a record (including `@comment` text) are skipped.
        token = next_token_default!(parser)
    end
    return preamble, parser.records
end

View File

@ -3,4 +3,4 @@ const file = joinpath((@__FILE__) |> dirname |> dirname, "example", "examples.bi
using BenchmarkTools
using BibTeX
# NOTE(review): `file` appears to be the path of the example .bib file (its
# definition is cut off above), while parse_bibtex tokenizes the text it is
# given. Confirm whether the file contents should be read before benchmarking.
@benchmark parse_bibtex(file)
# Same call, with the function name qualified by its module.
@benchmark BibTeX.parse_bibtex(file)

View File

@ -1,4 +1,4 @@
using BibTeX
using BibTeX, Base.Test
import Documenter
Documenter.makedocs(
@ -13,5 +13,8 @@ Documenter.makedocs(
authors = "Brandon Taylor"
)
# Smoke test only (for now): read the bundled example bibliography and check
# that parse_bibtex gets through it without throwing; the result is discarded.
joinpath((@__FILE__) |> dirname |> dirname, "example", "examples.bib") |> readstring |> parse_bibtex
@testset "examples.bib" begin
    # Locate the example file relative to this script instead of the current
    # working directory, so the test does not depend on where it is launched.
    path = joinpath(dirname(dirname(@__FILE__)), "example", "examples.bib")
    b = open(Bib, path, "r")
    @test length(b) == 92
    # The type assertion checks that the entry type ends up as the item's parameter.
    @test (b["angenendt"]::BibItem{:article})["date"] == "2002"
end