StringDistances.jl/src/utils.jl

83 lines
2.5 KiB
Julia
Raw Normal View History

2020-02-11 12:58:15 +01:00
##############################################################################
##
## Some things about Strings
# length: number of characters
# ncodeunits: Return the number of code units in a string (aking to index of vector).
# Not all such indices are valid they may not be the start of a character,.
# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str
# multiplied by the size, in bytes, of one code unit in str.
# lastindex: Return the last index of a collection
# nextinds(s, i): return the index of the start of the character whose encoding starts after index i
# nextind(s, 0, N): return the index of the Nth character of s (or, if there are
# less than N characters, return ncodeunits(str) + (N - length(s))
##############################################################################
2019-12-13 16:33:06 +01:00
# This type allows to compute length once and for all
struct StringWithLength{T <: AbstractString} <: AbstractString
2019-08-19 19:54:38 +02:00
s::T
l::Int
end
string_with_length(s::AbstractString) = StringWithLength(s, length(s))
2020-02-13 15:44:27 +01:00
# Not really needed but avoid multi-encapsulation
string_with_length(s::StringWithLength) = s
2019-08-19 19:54:38 +02:00
Base.length(s::StringWithLength) = s.l
2019-08-20 20:15:05 +02:00
Base.iterate(s::StringWithLength, i::Integer = firstindex(s.s)) = iterate(s.s, i)
2019-08-19 19:54:38 +02:00
Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n)
Base.ncodeunits(s::StringWithLength) = ncodeunits(s.s)
Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i)
2019-12-12 15:38:20 +01:00
2020-02-07 14:31:00 +01:00
2019-08-19 19:54:38 +02:00
function reorder(s1::AbstractString, s2::AbstractString)
s1 = string_with_length(s1)
s2 = string_with_length(s2)
2020-02-07 14:31:00 +01:00
(length(s1) <= length(s2)) ? (s1, s2) : (s2, s1)
end
function reorder(s1, s2)
(length(s1) <= length(s2)) ? (s1, s2) : (s2, s1)
end
2019-08-17 17:40:26 +02:00
2020-02-07 14:31:00 +01:00
function common_prefix(s1, s2)
2019-12-13 16:33:06 +01:00
l = 0
2020-02-18 14:18:45 +01:00
for (ch1, ch2) in zip(s1, s2)
2019-08-17 17:40:26 +02:00
ch1 != ch2 && break
2019-12-13 16:33:06 +01:00
l += 1
2019-08-17 17:40:26 +02:00
end
2020-02-18 14:18:45 +01:00
return l
end
function _take(s, n::Integer)
Base.Iterators.take(s, n)
end
function _take(s::AbstractString, n::Integer)
2020-02-16 17:12:31 +01:00
StringWithLength(SubString(s, firstindex(s), nextind(s, 0, n)), n)
end
function _drop(s, n::Integer)
Base.Iterators.drop(s, n)
end
2020-02-18 14:18:45 +01:00
function _drop(s::AbstractString, n::Integer)
2020-02-18 14:18:45 +01:00
SubString(s, nextind(s, 0, n + 1), lastindex(s))
end
function _drop(s::StringWithLength, n::Integer)
2020-02-16 17:12:31 +01:00
StringWithLength(SubString(s, nextind(s, 0, n + 1), lastindex(s)), length(s) - n)
end
function _slice(s, n1::Integer, n2::Integer)
Base.Iterators.take(Base.Iterators.drop(s, n1), n2 - n1)
end
function _slice(s::AbstractString, n1::Integer, n2::Integer)
SubString(s, nextind(s, 0, n1 + 1), nextind(s, 0, n2))
end