Compare commits

...

27 Commits

Author SHA1 Message Date
Brandon Taylor 673e30ce1f cosmetic changes: word variables, avoiding short circuits, less returns 2017-08-06 14:34:42 -04:00
bramtayl 3f6dfb22d1 Merge pull request #6 from bramtayl/latex
WIP: latex substitutions
2017-08-05 11:52:26 -04:00
Steven G. Johnson 93ecaf5ccf just use Vector{UInt8} instead of raw pointer loads, since it's not clear we care about performance here 2017-08-03 15:55:09 -04:00
Steven G. Johnson 26bfe8d705 rudimentary equation support (sub/superscripts, elimination of dollar signs) 2017-08-03 12:16:31 -04:00
Steven G. Johnson ab60bb59d8 implement custom directive parser 2017-08-03 09:40:14 -04:00
Brandon Taylor 54ecf83c40 fully fledged bracket counter 2017-08-03 09:17:14 -04:00
Brandon Taylor 1a8fad9cb5 Revert "move some tests into doctests"
This reverts commit 54da038df9.
2017-08-02 22:21:58 -04:00
Brandon Taylor 54da038df9 move some tests into doctests 2017-08-02 21:21:28 -04:00
Brandon Taylor 88c6e10e83 avoid inserting extraneous spaces 2017-08-02 20:04:31 -04:00
Brandon Taylor 75d1b6d74c made duplicated fields and keys error for safety 2017-08-02 17:51:43 -04:00
Steven G. Johnson 9c8c04e950 initial attempt at latex substitutions 2017-08-02 12:44:40 -04:00
bramtayl 251f16ce9f Merge pull request #4 from stevengj/bibtype
construct Bib and BibItem types for better I/O
2017-08-01 22:45:51 -04:00
Steven G. Johnson e6c0702811 more tests and fixes 2017-08-01 17:04:18 -04:00
Steven G. Johnson 457f4104d4 more tests and fixes, renamed to Bibliography and Citation 2017-08-01 16:51:03 -04:00
Steven G. Johnson 75809edb6b test case-insensitivity of entry types and field keys 2017-08-01 12:07:50 -04:00
Steven G. Johnson 85e5456187 define Bib and BibItem types for better IO, rather than Dict 2017-08-01 12:03:48 -04:00
bramtayl b16776190b Update README.md 2017-07-31 15:06:06 -04:00
Brandon Taylor 6e1e18e89b added test 2017-07-31 14:33:08 -04:00
Brandon Taylor 1c31cd6795 new files 2017-07-31 14:17:41 -04:00
Brandon Taylor 352997ae86 fix issues 2017-07-31 02:11:57 -04:00
Brandon Taylor 19b3a8804e fixed some issues 2017-07-31 01:01:32 -04:00
Brandon Taylor 687aea8d99 small fixes 2017-07-30 12:54:45 -04:00
Brandon Taylor d4d33933d4 added exports 2017-07-30 11:19:01 -04:00
Brandon Taylor 2054013b30 simpler 2017-07-30 11:10:48 -04:00
Brandon Taylor 233386140a fixed tests 2017-07-30 11:07:41 -04:00
Brandon Taylor 10d0eebc31 added in parser 2017-07-30 11:02:08 -04:00
Brandon Taylor e8e0983528 Generated files 2017-07-30 10:48:15 -04:00
19 changed files with 2532 additions and 0 deletions

1
.codecov Normal file
View File

@ -0,0 +1 @@
comment: false

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
*.jl.cov
*.jl.*.cov
*.jl.mem

14
.travis.yml Normal file
View File

@ -0,0 +1,14 @@
# Documentation: http://docs.travis-ci.com/user/languages/julia/
language: julia
os:
  - linux
julia:
  - 0.6
  - nightly
notifications:
  email: false
after_success:
  # build documentation
  - julia -e 'cd(Pkg.dir("BibTeX")); Pkg.add("Documenter"); include(joinpath("docs", "make.jl"))'
  # push coverage results to Codecov
  - julia -e 'cd(Pkg.dir("BibTeX")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'

41
LICENSE.md Normal file
View File

@ -0,0 +1,41 @@
The BibTeX.jl package is licensed under the MIT "Expat" License:
> Copyright (c) 2017: Brandon Taylor.
>
>
> Permission is hereby granted, free of charge, to any person obtaining a copy
>
> of this software and associated documentation files (the "Software"), to deal
>
> in the Software without restriction, including without limitation the rights
>
> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
>
> copies of the Software, and to permit persons to whom the Software is
>
> furnished to do so, subject to the following conditions:
>
>
>
> The above copyright notice and this permission notice shall be included in all
>
> copies or substantial portions of the Software.
>
>
>
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>
> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>
> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
>
> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>
> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
>
> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
>
> SOFTWARE.
>
>

20
README.md Normal file
View File

@ -0,0 +1,20 @@
# BibTeX
[![travis badge][travis_badge]][travis_url]
[![codecov badge][codecov_badge]][codecov_url]
## Documentation [here][documenter_latest]
Change documentation link to `documenter_stable` once published!
[travis_badge]: https://travis-ci.org/bramtayl/BibTeX.jl.svg?branch=master
[travis_url]: https://travis-ci.org/bramtayl/BibTeX.jl
[appveyor_badge]: https://ci.appveyor.com/api/projects/status/github/bramtayl/BibTeX.jl?svg=true&branch=master
[appveyor_url]: https://ci.appveyor.com/project/bramtayl/bibtex-jl
[codecov_badge]: http://codecov.io/github/bramtayl/BibTeX.jl/coverage.svg?branch=master
[codecov_url]: http://codecov.io/github/bramtayl/BibTeX.jl?branch=master
[documenter_stable]: https://bramtayl.github.io/BibTeX.jl/stable
[documenter_latest]: https://bramtayl.github.io/BibTeX.jl/latest

1
REQUIRE Normal file
View File

@ -0,0 +1 @@
julia 0.6

26
appveyor.yml Normal file
View File

@ -0,0 +1,26 @@
environment:
  matrix:
  - JULIAVERSION: "julialang/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
  - JULIAVERSION: "julialang/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
  - JULIAVERSION: "julianightlies/bin/winnt/x86/julia-latest-win32.exe"
  - JULIAVERSION: "julianightlies/bin/winnt/x64/julia-latest-win64.exe"
branches:
  only:
    - master
    - /release-.*/
notifications:
  - provider: Email
    on_build_success: false
    on_build_failure: false
    on_build_status_changed: false
install:
  - ps: (new-object net.webclient).DownloadFile(
        $("http://s3.amazonaws.com/"+$env:JULIAVERSION),
        "C:\projects\julia-binary.exe")
  - C:\projects\julia-binary.exe /S /D=C:\projects\julia
build_script:
  - IF EXIST .git\shallow (git fetch --unshallow)
  - C:\projects\julia\bin\julia -e "versioninfo();
      Pkg.clone(pwd(), \"BibTeX\"); Pkg.build(\"BibTeX\")"
test_script:
  - C:\projects\julia\bin\julia -e "Pkg.test(\"BibTeX\")"

2
docs/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
build/
site/

8
docs/make.jl Normal file
View File

@ -0,0 +1,8 @@
# Documentation deployment script; it is included from the `after_success`
# step of .travis.yml.
import Documenter
Documenter.deploydocs(
repo = "github.com/bramtayl/BibTeX.jl.git",
target = "build",
# NOTE(review): `deps = nothing` and `make = nothing` presumably disable
# Documenter's default dependency-install and build steps -- confirm against
# the Documenter version in use.
deps = nothing,
make = nothing
)

8
docs/src/index.md Normal file
View File

@ -0,0 +1,8 @@
# BibTeX.jl
```@index
```
```@autodocs
Modules = [BibTeX]
```

1674
example/examples.bib Normal file

File diff suppressed because it is too large Load Diff

9
src/BibTeX.jl Normal file
View File

@ -0,0 +1,9 @@
# BibTeX: parse BibTeX databases into `Bibliography` / `Citation` objects and
# simplify LaTeX markup in their fields.
module BibTeX
export Bibliography, Citation
# Include order matters: the `Bibliography` struct in bibliography.jl names the
# `Citation` type in its definition, so citation.jl has to be included first.
include("parser.jl")
include("citation.jl")
include("bibliography.jl")
include("latex.jl")
end

39
src/bibliography.jl Normal file
View File

@ -0,0 +1,39 @@
# Dictionary-like container of parsed citations, keyed by citation id.
struct Bibliography <: Associative{String,Citation}
preamble::String # preamble text as returned by `parse_bibtex` ("" for `similar`)
data::Dict{String,Citation} # backing storage that the dictionary methods forward to
end
"""
    Bibliography(bibtex::String)
    Bibliography(io::IO)

Parse BibTeX-format bibliography data, supplied either as a string or as an
IO stream, and return a `Dict`-like object `b::Bibliography` that maps each
citation key (a string) to its bibliography item, a [`Citation`](@ref).
"""
function Bibliography(bibtex::String)
    preamble, records = parse_bibtex(bibtex)
    # each raw record (a Dict of strings) becomes a typed `Citation`
    citations = Dict(id => Citation!(fields) for (id, fields) in records)
    return Bibliography(preamble, citations)
end
# Parse everything that can be read from `io`.
function Bibliography(io::IO)
    return Bibliography(readstring(io))
end

# `open(Bibliography, args...)`: open a stream with `args` and parse its contents.
function Base.open(::Type{Bibliography}, args...)
    open(args...) do io
        Bibliography(io)
    end
end

# A fresh bibliography with an empty preamble and no citations.
function Base.similar(b::Bibliography)
    return Bibliography("", Dict{String,Citation}())
end

# The in-place container operations act on the backing dictionary and hand
# back the bibliography itself.
function Base.rehash!(b::Bibliography, n=length(b.data))
    Base.rehash!(b.data, n)
    return b
end

function Base.sizehint!(b::Bibliography, n)
    sizehint!(b.data, n)
    return b
end

function Base.empty!(b::Bibliography)
    empty!(b.data)
    return b
end

# Copy of the dictionary; the preamble string is shared.
function Base.copy(b::Bibliography)
    return Bibliography(b.preamble, copy(b.data))
end
# Store citation `v` under key `k` (converted to `String`); returns `b`.
function Base.setindex!(b::Bibliography, v::Citation, k::AbstractString)
    key = String(k)
    b.data[key] = v
    return b
end

# Look up `k`, falling back to `default` when the key is absent.
function Base.get(b::Bibliography, k::AbstractString, default)
    return get(b.data, String(k), default)
end

# Iteration and length are those of the backing dictionary.
function Base.start(b::Bibliography)
    return start(b.data)
end

function Base.done(b::Bibliography, i)
    return done(b.data, i)
end

function Base.next(b::Bibliography, i)
    return next(b.data, i)
end

function Base.length(b::Bibliography)
    return length(b.data)
end
# todo: add specialized Base.show methods for MIME"text/bibtex" etc.

42
src/citation.jl Normal file
View File

@ -0,0 +1,42 @@
"""
    Citation{S}(data::Dict{String,String})

A bibliography item in a BibTeX database, based on a dictionary of
strings to values. It is parameterized by a symbol `S` giving the
type of the item (`:article` etcetera). A `b::Citation` supports
`b[key]` access to retrieve the data and in general acts like
a dictionary from `String` to `String`.
"""
struct Citation{S} <: Associative{String,String}
data::Dict{String,String} # field name => field value; the dictionary methods forward to it
end
# An empty citation of type `S`.
function Citation{S}() where {S}
    return Citation{S}(Dict{String,String}())
end

# Turn a parsed record into a `Citation`. The "__type__" entry is removed from
# `data` (hence the `!`) and becomes the type parameter; the remaining entries
# are used as the citation's fields without copying.
function Citation!(data::Dict{String,String})
    record_type = pop!(data, "__type__")
    return Citation{Symbol(record_type)}(data)
end
# Dictionary interface of `Citation`: every method forwards to `b.data`.

# A fresh, empty citation of the same type `S`.
function Base.similar(b::Citation{S}) where {S}
    return Citation{S}(Dict{String,String}())
end

# The in-place operations return the citation itself.
function Base.rehash!(b::Citation, n=length(b.data))
    Base.rehash!(b.data, n)
    return b
end

function Base.sizehint!(b::Citation, n)
    sizehint!(b.data, n)
    return b
end

function Base.empty!(b::Citation)
    empty!(b.data)
    return b
end

function Base.copy(b::Citation{S}) where {S}
    return Citation{S}(copy(b.data))
end

# Keys and values are converted to `String` before touching the dictionary.
function Base.get(b::Citation, k::AbstractString, default)
    return get(b.data, String(k), default)
end

function Base.getindex(b::Citation, k::AbstractString)
    return getindex(b.data, String(k))
end

function Base.setindex!(b::Citation, v::AbstractString, k::AbstractString)
    key, value = String(k), String(v)
    b.data[key] = value
    return b
end

function Base.start(b::Citation)
    return start(b.data)
end

function Base.done(b::Citation, i)
    return done(b.data, i)
end

function Base.next(b::Citation, i)
    return next(b.data, i)
end

function Base.length(b::Citation)
    return length(b.data)
end
# Compact one-line display, e.g. `Citation{:article}(3 entries)`.
# Written with `where {S}` like the other parametric methods in this file; the
# older `show{S}(...)` method syntax is deprecated in later Julia versions.
function Base.show(io::IO, b::Citation{S}) where {S}
    print(io, "Citation{:$S}(", length(b), " entries)")
end
# TODO: add Base.show text/plain and text/markdown for formatted citation

361
src/latex.jl Normal file
View File

@ -0,0 +1,361 @@
# conversion of LaTeX directives to plain text, markdown, etc.
#
# The basic idea is that we search for `\foo{argument}`, `{\foo argument}`,
# or `{\foo{argument}}`, and look up `foo` in a dictionary of substitutions
# like `\textit` -> `*#1*` where #1 is where the (first) argument is
# substituted. Then we have separate dictionary entries for text/plain,
# text/markdown, etcetera.
###########################################################################
# parsing LaTeX directives:
# Byte values of the ASCII characters the directive parser looks for.
const BACKSLASH = UInt8('\\')
const BRACE_OPEN = UInt8('{')
const BRACE_CLOSE = UInt8('}')
const SPACE = UInt8(' ')
const DOLLAR = UInt8('$')
const CARET = UInt8('^')
const UNDERSCORE = UInt8('_')

# True when byte `x` is an ASCII letter (`a`-`z` or `A`-`Z`).
# ASCII `<=` is used so the range checks survive copy/paste and re-encoding.
is_letter(x::UInt8) = UInt8('a') <= x <= UInt8('z') || UInt8('A') <= x <= UInt8('Z')

# True when byte `x` is an ASCII digit or an ASCII letter.
is_alphanumeric(x::UInt8) = UInt8('0') <= x <= UInt8('9') || is_letter(x)
"""
    search_latex_directive(astring, start_position = 1)

Search for a LaTeX directive `\\directive{argument}` or similar in `astring`,
starting at byte index `start_position`. Return
`(directive_start, directive_end, argument_end)` such that
`astring[directive_start:directive_end]` gives `\\directive` and
`astring[directive_end+1:argument_end]` gives `{argument}`.
Use [`strip_argument`](@ref) to remove surrounding braces and whitespace
from the argument.

A bare `{...}` group is reported as an empty directive
(`directive_end == directive_start - 1`), and `^` / `_` are reported as
one-character directives. `(0, 0, 0)` is returned when `start_position` is out
of range or when no directive is found.
"""
function search_latex_directive(astring, start_position = 1)
    string_length = sizeof(astring)
    if !(0 < start_position <= string_length)
        return 0, 0, 0
    end
    character_vector = Vector{UInt8}(astring)
    index = start_position
    all_spaces = true
    # find \foo directive or {...}:
    character = UInt8(0)
    while index <= string_length
        character = character_vector[index]
        if (character == BACKSLASH || character == BRACE_OPEN || character == CARET || character == UNDERSCORE)
            break
        end
        if character != SPACE
            all_spaces = false
        end
        index += 1
    end
    if index <= string_length && character != BRACE_OPEN
        directive_start = index
        if character == BACKSLASH
            # a directive is a backslash followed by either a run of letters
            # or exactly one non-letter character
            index += 2
            if index - 1 > string_length
                return 0, 0, 0
            end
            if is_letter(character_vector[index - 1])
                while index <= string_length && is_letter(character_vector[index])
                    index += 1
                end
            end
            directive_end = index - 1
        else
            directive_end = directive_start # ^ or _
            index += 1
        end
        # look for optional opening brace
        while index <= string_length && character_vector[index] == SPACE
            index += 1
        end
        if index > string_length
            return directive_start, directive_end, string_length
        end
        in_braces = character_vector[index] == BRACE_OPEN
        if !in_braces
            # search backwards from \foo to look for { \foo ...}
            backwards_index = directive_start - 1
            while backwards_index >= start_position && character_vector[backwards_index] == SPACE
                backwards_index -= 1
            end
            if backwards_index < start_position || character_vector[backwards_index] != BRACE_OPEN
                if character_vector[index] == BACKSLASH
                    # argument is another latex directive
                    inner_start_position, inner_directive_end, inner_argument_end = search_latex_directive(astring, index)
                    return directive_start, directive_end, inner_argument_end
                elseif character != BACKSLASH
                    # in an equation, token is a single char
                    return directive_start, directive_end, index
                elseif all_spaces
                    # if `\directive ...` was preceded only
                    # by whitespace, then assume arguments
                    # extend to the end of the string. This
                    # happens when we recurse on `{\directive ...}`.
                    return directive_start, directive_end, string_length
                else
                    # argument is not in braces … get next token
                    while index <= string_length && is_alphanumeric(character_vector[index])
                        index += 1
                    end
                    return directive_start, directive_end, index - 1
                end
            end
        end
        index += 1
    elseif index > string_length
        return 0, 0, 0
    else # { ... }
        directive_start = index
        directive_end = index - 1
        in_braces = true
        index += 1
    end
    # search for end of argument (closing brace)
    number_of_braces = 1
    while index <= string_length
        character = character_vector[index]
        if character == BRACE_OPEN
            number_of_braces += 1
        elseif character == BRACE_CLOSE
            number_of_braces -= 1
            if number_of_braces == 0
                # for `\foo{...}` the closing brace belongs to the argument;
                # for `{\foo ...}` it belongs to the enclosing group
                argument_end = if in_braces
                    index
                else
                    index - 1
                end
                return directive_start, directive_end, argument_end
            end
        end
        index += 1
    end
    # unbalanced braces: the argument runs to the end of the string
    return directive_start, directive_end, string_length
end
"""
    strip_argument(astring, start_position = start(astring), end_position = endof(astring))

Return the substring of `astring` corresponding to the argument from
`start_position:end_position`, stripping leading/trailing whitespace and braces.

An empty substring is returned when `start_position > end_position`. A
`BoundsError` is thrown when either position lies outside `1:endof(astring)`,
and an error is raised ("malformed argument") when the range ends in `}` but
contains no opening `{`.
"""
function strip_argument(astring, start_position = start(astring), end_position = endof(astring))
    if start_position > end_position
        return SubString(astring, 1, 0)
    end
    string_length = endof(astring)
    if !(1 <= start_position <= string_length && 1 <= end_position <= string_length)
        throw(BoundsError())
    end
    character_vector = Vector{UInt8}(astring)
    if character_vector[end_position] == BRACE_CLOSE
        end_position -= 1 # omit brace
        # skip everything up to the matching opening brace
        while start_position <= end_position && character_vector[start_position] != BRACE_OPEN
            start_position += 1
        end
        if start_position > end_position
            error("malformed argument")
        end
        start_position += 1 # omit brace
    end
    while start_position <= end_position && character_vector[end_position] == SPACE
        end_position -= 1
    end
    while start_position <= end_position && character_vector[start_position] == SPACE
        start_position += 1
    end
    return SubString(astring, start_position, end_position)
end
# `replace` only needs a pattern object that `search` understands. This marker
# type plugs our custom scanner in: `search(s, LaTeXDirectiveSearch(), index)`
# returns the index range of the next directive together with its argument.
struct LaTeXDirectiveSearch end

function Base.search(s::AbstractString, ::LaTeXDirectiveSearch, index)
    first_position, _, last_position = search_latex_directive(s, index)
    # a start before `index` (in particular 0) means that nothing was found
    return first_position < index ? (0:-1) : (first_position:last_position)
end
###########################################################################
# Unicode substitutions for LaTeX directives
# Maps a directive (as matched by search_latex_directive) to a substitution
# template; `#1` in a template marks where the directive's argument is inserted.
const latex_unicode = Dict(
# accent escapes like `\"u` for `ü`, from the list at
# https://en.wikibooks.org/wiki/LaTeX/Special_Characters
# converted to Unicode characters (mostly combining marks)
"\\`" => "#1\u0300",
"\\'" => "#1\u0301",
"\\^" => "#1\u0302",
"\\\"" => "#1\u0308",
"\\H" => "#1\u030b",
"\\~" => "#1\u0303",
"\\c" => "#1\u0327",
"\\k" => "#1\u0328",
"\\l" => "\u0142",
"\\=" => "#1\u0304",
"\\b" => "#1\u0331",
"\\." => "#1\u0307",
"\\d" => "#1\u0323",
"\\r" => "#1\u030a",
"\\u" => "#1\u0306",
"\\v" => "#1\u030c",
"\\t" => "#1\u0361", # fixme: u+0361 should go after first char in #1
"\\o" => "\u00f8",
"\\i" => "\u0131",
"\\j" => "\u0237",
# other backslash escapes
"\\\\" => "\\",
"\\{" => "{", "\\}" => "}",
"\\%" => "%",
# "\\\$" => "\$" -- dollar signs will be unescaped in strip_dollars
# We parse {....} quoting as an empty directive:
"" => "#1",
# many other substitutions can be found in
# Base.REPLCompletions.latex_symbols
)
# LaTeX directives converted to Markdown
# (`#1` marks where the directive's argument is inserted)
const markdown_directives = Dict(
"\\emph" => "_#1_",
"\\textit" => "_#1_",
"\\it" => "_#1_",
"\\mathit" => "_#1_",
"\\textbf" => "**#1**",
"\\bf" => "**#1**",
"\\mathbf" => "**#1**",
"\\texttt" => "`#1`",
"\\mathrm" => "#1",
"\\url" => "[#1](#1)",
"\\sout" => "~~#1~~",
"\\st" => "~~#1~~",
"\\cancel" => "~~#1~~",
)
# directives that are stripped when converting
# to text/plain (only the argument `#1` is kept)
# NOTE(review): `\sout`, `\st` and `\cancel` have Markdown forms in
# `markdown_directives` but no entry here -- confirm whether leaving them
# out of the plain-text table is intended.
const text_directives = Dict(
"\\emph" => "#1",
"\\textit" => "#1",
"\\it" => "#1",
"\\mathit" => "#1",
"\\textbf" => "#1",
"\\bf" => "#1",
"\\mathbf" => "#1",
"\\texttt" => "#1",
"\\mathrm" => "#1",
"\\url" => "#1",
)
# Unicode includes an incomplete set of super/subscript characters:
# characters without an entry below cannot be converted, and
# `replace_characters` then returns "" for the whole string.
const superscripts = Dict(
'0'=>'⁰', '1'=>'¹', '2'=>'²', '3'=>'³', '4'=>'⁴', '5'=>'⁵', '6'=>'⁶', '7'=>'⁷', '8'=>'⁸', '9'=>'⁹',
'a'=>'ᵃ', 'b'=>'ᵇ', 'c'=>'ᶜ', 'd'=>'ᵈ', 'e'=>'ᵉ', 'f'=>'ᶠ', 'g'=>'ᵍ', 'h'=>'ʰ',
'i'=>'ⁱ', 'j'=>'ʲ', 'k'=>'ᵏ', 'l'=>'ˡ', 'm'=>'ᵐ', 'n'=>'ⁿ', 'o'=>'ᵒ', 'p'=>'ᵖ',
'r'=>'ʳ', 's'=>'ˢ', 't'=>'ᵗ', 'u'=>'ᵘ', 'v'=>'ᵛ', 'w'=>'ʷ', 'x'=>'ˣ', 'y'=>'ʸ', 'z'=>'ᶻ',
'A'=>'ᴬ', 'B'=>'ᴮ', 'C'=>'ᶜ', 'D'=>'ᴰ', 'E'=>'ᴱ', 'G'=>'ᴳ', 'H'=>'ᴴ', 'I'=>'ᴵ', 'J'=>'ᴶ',
'K'=>'ᴷ', 'L'=>'ᴸ', 'M'=>'ᴹ', 'N'=>'ᴺ', 'O'=>'ᴼ', 'P'=>'ᴾ', 'R'=>'ᴿ', 'S'=>'ˢ', 'T'=>'ᵀ',
'U'=>'ᵁ', 'V'=>'ⱽ', 'W'=>'ᵂ', 'β'=>'ᵝ', 'γ'=>'ᵞ', 'δ'=>'ᵟ', 'ψ'=>'ᵠ', 'χ'=>'ᵡ', 'Θ'=>'ᶿ',
'+'=>'⁺', '-'=>'⁻', '='=>'⁼', '('=>'⁽', ')'=>'⁾', ' '=>' ', '∘'=>'°',
)
# Subscript counterpart of `superscripts`.
const subscripts = Dict(
'0'=>'₀', '1'=>'₁', '2'=>'₂', '3'=>'₃', '4'=>'₄', '5'=>'₅', '6'=>'₆', '7'=>'₇', '8'=>'₈', '9'=>'₉',
'a'=>'ₐ', 'e'=>'ₑ', 'h'=>'ₕ', 'i'=>'ᵢ', 'j'=>'ⱼ', 'k'=>'ₖ', 'l'=>'ₗ', 'm'=>'ₘ',
'n'=>'ₙ', 'o'=>'ₒ', 'p'=>'ₚ', 'r'=>'ᵣ', 's'=>'ₛ', 't'=>'ₜ', 'u'=>'ᵤ', 'v'=>'ᵥ', 'x'=>'ₓ',
'β'=>'ᵦ', 'γ'=>'ᵧ', 'ρ'=>'ᵨ', 'ψ'=>'ᵩ', 'χ'=>'ᵪ',
'-'=>'₋', '+'=>'₊', '='=>'₌', '('=>'₍', ')'=>'₎', ' '=>' ',
)
# Map every character of `astring` through `character_map` and return the
# mapped string. The conversion is all-or-nothing: as soon as one character
# has no mapping, the empty string is returned instead.
function replace_characters(astring, character_map)
    output = IOBuffer()
    for original in astring
        replacement = get(character_map, original, '\0')
        # '\0' doubles as the "no mapping" marker
        replacement == '\0' && return ""
        print(output, replacement)
    end
    return String(take!(output))
end
# Given a (sub)string `astring` that represents a LaTeX directive matched
# by search_latex_directive, performs our Unicode substitutions and
# also any additional substitutions given by extra_directives.
# Returns the replacement text, or `astring` itself when the directive is
# not recognized.
function directive_substitution(astring, extra_directives)
start_position, directive_end, argument_end = search_latex_directive(astring)
string_length = endof(astring)
directive = SubString(astring, start_position, directive_end)
# The first table that knows the directive wins, so `extra_directives`
# takes precedence over the built-in tables.
for dict in (extra_directives, latex_unicode, Base.REPLCompletions.latex_symbols)
if haskey(dict, directive)
substitution = dict[directive]
if contains(substitution, "#1")
# template with an argument slot: strip the argument, process nested
# directives in it, strip again, then insert it for `#1`
argument = strip_argument(replace_directives(strip_argument(astring, directive_end + 1, string_length), extra_directives))
return replace(substitution, "#1", argument)
else
# no argument slot: the rest of the string is processed and kept
argument = replace_directives(SubString(astring, directive_end+1, string_length), extra_directives)
if strwidth(substitution) == 0 # \hat{...} etc: combining chars go after argument
return string(strip_argument(argument), substitution)
else
return string(substitution, argument) # don't strip for 0-arg macros
end
end
end
end
if directive == "^" || directive == "_" # super/subscripts
argument = strip_argument(replace_directives(strip_argument(astring, directive_end + 1, string_length), extra_directives))
dict = if directive == "^"
superscripts
else
subscripts
end
# replace_characters returns "" when some character has no super/subscript
# form; in that case fall through and keep the original text
substitution = replace_characters(argument, dict)
if !isempty(substitution)
return substitution
end
end
astring # ignore unrecognized directives
end
# Replace every LaTeX directive found in `astring` with the result of
# `directive_substitution` for that directive.
function replace_directives(astring, extra_directives)
    substitute(matched) = directive_substitution(matched, extra_directives)
    return replace(astring, LaTeXDirectiveSearch(), substitute)
end
# Remove unescaped `$` signs from `astring`; an escaped `\$` becomes a plain `$`.
function strip_dollars(astring)
    bytes = Vector{UInt8}(astring)
    byte_count = sizeof(astring)
    output = IOBuffer()
    for position in 1:byte_count
        byte = bytes[position]
        if byte == DOLLAR
            # never copied; for `\$` the `$` was already written when the
            # backslash was seen
            continue
        end
        escapes_dollar = byte == BACKSLASH && position < byte_count && bytes[position + 1] == DOLLAR
        write(output, escapes_dollar ? DOLLAR : byte)
    end
    return String(take!(output))
end
"""
    simplify_latex(astring, extra_directives)

Convert the LaTeX string `astring` to "plain text" where possible: known LaTeX
directives are stripped or replaced, e.g. by Unicode characters, and unescaped
dollar signs are removed.

`extra_directives` is a dictionary (`String=>String`) mapping LaTeX directives
to their replacements. The default, `BibTeX.text_directives`, simply strips
things like bold and italics; pass `BibTeX.markdown_directives` to get
Markdown syntax for such directives instead.
"""
function simplify_latex(astring, extra_directives = text_directives)
    without_directives = replace_directives(astring, extra_directives)
    return strip_dollars(without_directives)
end

218
src/parser.jl Normal file
View File

@ -0,0 +1,218 @@
# Mutable state of the BibTeX parser.
mutable struct Parser{T}
tokens::T # tokens still to be read; consumed from the front with `shift!`
substitutions::Dict{String, String} # fields collected from `@string{...}` entries
records::Dict{String, Dict{String, String}} # parsed entries: id => (field => value)
line::Int # starts at 1 and grows by the newlines in each consumed token (for error messages)
bracket_counter::Int # current `{` / `}` nesting depth while a value is being read
end
# Token type of the parser: the element type of its token collection.
function Base.eltype(p::Parser)
    return eltype(p.tokens)
end

# The empty token, which marks exhausted input.
function Base.one(p::Parser)
    return eltype(p)("")
end

# Construct a parser whose type parameter is taken from `tokens`.
function Parser(tokens::T, substitutions, records, line, bracket_counter) where T
    return Parser{T}(tokens, substitutions, records, line, bracket_counter)
end

# Fresh parser: no substitutions, no records, line 1, bracket depth 0.
function Parser(tokens)
    return Parser(tokens, Dict{String, String}(), Dict{String, Dict{String, String}}(), 1, 0)
end

# Tokenize `text`: each token is a run of ordinary characters, a run of
# whitespace, or one of the special characters `"`, `#`, `{`, `}`, `@`, `,`,
# `=`, `\`.
function parse_text(text)
    tokens = matchall(r"[^\s\"#{}@,=\\]+|\s+|\"|#|{|}|@|,|=|\\", text)
    return Parser(tokens)
end

# Position suffix used in error messages.
function location(parser)
    return "on line $(parser.line)"
end

function Base.isempty(p::Parser)
    return isempty(p.tokens)
end
# Remove and return the next token, keeping the line counter up to date.
# Returns the empty token when the input is exhausted; a token consisting only
# of whitespace is returned as a single space.
function next_token_default!(parser)
    isempty(parser.tokens) && return one(parser)
    token = shift!(parser.tokens)
    parser.line += count(x -> x == '\n', token)
    return all(isspace, token) ? eltype(parser)(" ") : token
end
# Like `next_token_default!`, but running out of input is an error; `eol`
# names what was expected, for the error message. Whitespace tokens are kept.
function next_token_with_space!(parser, eol = "additional tokens")
    token = next_token_default!(parser)
    token == "" && error("Expected $eol $(location(parser))")
    return token
end
# Next token with one leading whitespace token skipped: when the token read is
# whitespace, the token after it is returned instead. Errors (mentioning `eol`)
# when the input runs out.
function next_token!(parser, eol = "additional tokens")
    token = next_token_with_space!(parser, eol)
    return all(isspace, token) ? next_token_with_space!(parser, eol) : token
end
# Raise an error unless the token `result` equals the expected token `eol`.
function expect(parser, result, eol)
    result == eol && return nothing
    error("Expected $eol $(location(parser))")
end

# Read the next non-whitespace token and require it to be `eol`.
function expect!(parser, eol)
    token = next_token!(parser, eol)
    return expect(parser, token, eol)
end
# Read the next token (whitespace included) and track brace nesting: `{` raises
# `parser.bracket_counter`, `}` lowers it. A `}` that would make the depth
# negative is an error; otherwise the token is returned.
function token_and_counter!(parser, eol = "}")
    token = next_token_with_space!(parser, eol)
    depth_change = token == "{" ? 1 : (token == "}" ? -1 : 0)
    parser.bracket_counter += depth_change
    parser.bracket_counter < 0 && error("} without corresponding { $(location(parser))")
    return token
end
# Read one field value and return `(token, value)`, where `token` is the token
# that follows the value and `value` is the joined text.
# A value is a `"..."`-quoted string, a `{...}`-braced group or a bare token;
# parts chained with `#` are read recursively into the same `values` vector,
# with a single space inserted between them.
value!(parser, values = eltype(parser)[]) = begin
token = next_token!(parser)
if token == "\""
token = token_and_counter!(parser, "\"")
# a `"` ends the string only when it is not nested inside braces
while !(token == "\"" && parser.bracket_counter == 0)
push!(values, token)
token = token_and_counter!(parser, "\" or }")
end
elseif token == "{"
# the opening brace was read by next_token!, so it is counted by hand
parser.bracket_counter += 1
token = token_and_counter!(parser)
while parser.bracket_counter > 0
push!(values, token)
token = token_and_counter!(parser)
end
else
# NOTE(review): `getkey` returns the matching *key* (or the default), so a
# bare token always comes back unchanged and the text stored by `@string` is
# never substituted; `get` may have been intended -- confirm (the
# `parse_bibtex` doctest currently expects the unsubstituted "f short").
push!(values, getkey(parser.substitutions, token, String(token) ) )
end
token = next_token!(parser, ", or }")
if token == "#"
push!(values, " ")
value!(parser, values)
else
token, join(values)
end
end
# Read comma-separated `key = value` fields into `dict` until the closing `}`
# of the entry. Keys are lower-cased; a repeated key is an error, and so is
# anything other than `}` after the last field.
function field!(parser, dict)
    token = ","
    while token == ","
        token = next_token!(parser, "a new entry or }")
        # a `}` right after a comma (or at the very start) ends the entry
        token == "}" && break
        key = lowercase(token)
        haskey(dict, key) && error("Duplicated field $key $(location(parser))")
        expect!(parser, "=")
        # value! also hands back the token following the value (`,` or `}`)
        token, dict[key] = value!(parser)
    end
    return expect(parser, token, "}")
end
"""
parse_bibtex(text)
This is a simple input parser for BibTex. I had trouble finding a standard
specification, but I've included several features of real BibTex. Returns
a preamble (or an empty string) and a dict of dicts.
```jldoctest
julia> using BibTeX: parse_bibtex
julia> preamble, result = parse_bibtex(""\"
@preamble{some instructions}
@comment blah blah
@string{short = long}
@a{b,
c = {{c} c},
d = "d {"} d",
e = f # short
}
""\");
julia> preamble
"some instructions"
julia> result["b"]["__type__"]
"a"
julia> result["b"]["c"]
"{c} c"
julia> result["b"]["d"]
"d {\\"} d"
julia> result["b"]["e"]
"f short"
julia> parse_bibtex("@book")
ERROR: Expected { on line 1
[...]
julia> parse_bibtex("@book@")
ERROR: Expected { on line 1
[...]
```
Repeated fields and keys are not allowed:
```jldoctest
julia> using BibTeX: parse_bibtex
julia> parse_bibtex(""\"
@book{abook,
title = A}
@book{abook,
title = B}
""\")
ERROR: Duplicated id abook on line 3
[...]
julia> parse_bibtex(""\"
@book{abook,
title = A,
title = B}
""\")
ERROR: Duplicated field title on line 3
[...]
```
"""
parse_bibtex(text) = begin
parser = parse_text(text)
token = next_token_default!(parser)
preamble = ""
# Top-level loop: every token that is not `@` is skipped, so text between
# entries is ignored. The empty token marks the end of the input.
while token != ""
if token == "@"
# entry types are matched case-insensitively
record_type = lowercase(next_token!(parser))
if record_type == "preamble"
# value! returns (following token, text); only the text is kept
trash, preamble = value!(parser)
elseif record_type != "comment"
# nothing is read for `@comment`: its tokens are skipped by the outer loop
expect!(parser, "{")
if record_type == "string"
# `@string{key = value}` fields are stored as substitutions
field!(parser, parser.substitutions)
else
# regular entry: `@type{id, field = value, ...}`
id = next_token!(parser)
records = parser.records
if haskey(records, id)
error("Duplicated id $id $(location(parser))")
else
# the entry type is stored next to the fields under "__type__"
dict = Dict("__type__" => record_type)
expect!(parser, ",")
field!(parser, dict)
records[id] = dict
end
end
end
end
token = next_token_default!(parser)
end
preamble, parser.records
end

1
test/REQUIRE Normal file
View File

@ -0,0 +1 @@
Documenter

6
test/benchmark.jl Normal file
View File

@ -0,0 +1,6 @@
# Benchmark of the parser on the bundled example database.
# Note that `file` holds the *contents* of example/examples.bib (the path is
# piped through `readstring`), not the file name.
const file = joinpath((@__FILE__) |> dirname |> dirname, "example", "examples.bib") |> readstring
using BenchmarkTools
using BibTeX
@benchmark BibTeX.parse_bibtex(file)

58
test/runtests.jl Normal file
View File

@ -0,0 +1,58 @@
using BibTeX, Base.Test
# Root directory of the package (the parent of test/).
base_file = dirname(dirname(@__FILE__))
# Build the documentation as part of the test run.
# NOTE(review): with `strict = true` a failing doctest presumably fails the
# whole test run -- confirm against the Documenter version in use.
import Documenter
Documenter.makedocs(
modules = [BibTeX],
format = :html,
sitename = "BibTeX.jl",
root = joinpath(base_file, "docs"),
pages = Any["Home" => "index.md"],
strict = true,
linkcheck = true,
checkdocs = :exports,
authors = "Brandon Taylor"
)
# Parse the bundled example database through the `open(Bibliography, ...)` method.
@testset "examples.bib" begin
# note: ".." does not work on windows
b = open(Bibliography, joinpath(base_file, "example", "examples.bib"), "r")
@test length(b) == 92
# the type assertion checks that the entry type ends up in the type parameter
@test (b["angenendt"]::Citation{:article})["date"] == "2002"
end
# Exercise the dictionary interface of `Bibliography` and `Citation`.
@testset "small bib" begin
b = Bibliography("""
@article{foo, bar=baz}
@book{bar, foobar=1}
""")
# missing keys fall back to the default at both levels
@test get(b, "foobar", nothing) === nothing
@test get(b["foo"], "blah", nothing) === nothing
@test string(b["foo"]) == "Citation{:article}(1 entries)"
# in-place operations return the container itself, so they can be chained
Base.rehash!(b)
b2 = copy(b)
@test length(b2) == length(b)
@test isempty(sizehint!(empty!(b2),10))
@test isempty(similar(b))
# insertion of a new citation, then of a field into that citation
b2["x"] = Citation{:foo}()
b2["x"]["bar"] = "blah"
@test length(b2) == length(b2["x"]) == 1
@test b2["x"]["bar"] == "blah"
@test get(b2["x"], "foo", nothing) === nothing
# iteration yields key => value pairs
@test collect(b2)[1][2] == b2["x"]
@test collect(b2["x"])[1] == ("bar"=>"blah")
Base.rehash!(b2["x"])
# copying keeps the citation type
x2 = copy(b2["x"])::Citation{:foo}
@test length(x2) == 1
@test isempty(similar(x2))
@test isempty(sizehint!(empty!(x2),10))
end
import BibTeX: simplify_latex, markdown_directives
# One string covering escaped and unescaped dollar signs, sub/superscripts,
# a combining accent, brace quoting, backslash escapes and nested formatting
# directives, converted with the Markdown table.
@testset "latex" begin
@test simplify_latex(raw"foo \$$x_1x_2^\mathrm{3}$ \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", markdown_directives) ==
"foo \$x₁x₂³ α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û"
end