From 9f30c134cfb42ad7016cbe843a00996f7dd5acf1 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Fri, 7 Feb 2020 08:31:00 -0500 Subject: [PATCH] allow more than strings --- src/StringDistances.jl | 10 +++++++--- src/edit.jl | 9 ++++++--- src/utils.jl | 15 +++++++++------ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 29faaaa..88eec10 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -17,9 +17,13 @@ include("find.jl") ## ############################################################################## -evaluate(::StringDistance, ::Missing, ::AbstractString) = missing -evaluate(::StringDistance, ::AbstractString, ::Missing) = missing -evaluate(::StringDistance, ::Missing, ::Missing) = missing +evaluate(::QGramDistance, ::Missing, ::AbstractString) = missing +evaluate(::QGramDistance, ::AbstractString, ::Missing) = missing + +evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing +evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing + + compare(::Missing, ::AbstractString, ::StringDistance; min_score = 0.0) = missing compare(::AbstractString, ::Missing, ::StringDistance; min_score = 0.0) = missing diff --git a/src/edit.jl b/src/edit.jl index 454e59a..17414be 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -15,7 +15,8 @@ struct Jaro <: StringDistance end ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html -function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) +function evaluate(dist::Jaro, s1, s2) + (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) # if both are empty, m = 0 so should be 1.0 according to wikipedia. @@ -85,7 +86,8 @@ struct Levenshtein <: StringDistance end # Return max_dist +1 if distance higher than max_dist # This makes it possible to differentiate distance equalt to max_dist vs strictly higher # This is important for find_all -function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max_dist = nothing) +function evaluate(dist::Levenshtein, s1, s2; max_dist = nothing) + (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 @@ -138,7 +140,8 @@ required to change one string into the other. struct DamerauLevenshtein <: StringDistance end ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html -function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString; max_dist = nothing) +function evaluate(dist::DamerauLevenshtein, s1, s2; max_dist = nothing) + (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 diff --git a/src/utils.jl b/src/utils.jl index c6000c9..e5e45d5 100755 --- a/src/utils.jl +++ b/src/utils.jl @@ -10,17 +10,20 @@ Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n) Base.ncodeunits(s::StringWithLength) = ncodeunits(s.s) Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i) + function reorder(s1::AbstractString, s2::AbstractString) s1 = string_with_length(s1) s2 = string_with_length(s2) - if length(s1) <= length(s2) - return s1, s2 - else - return s2, s1 - end + (length(s1) <= length(s2)) ? (s1, s2) : (s2, s1) end -function common_prefix(s1::AbstractString, s2::AbstractString) +function reorder(s1, s2) + (length(s1) <= length(s2)) ? (s1, s2) : (s2, s1) +end + + + +function common_prefix(s1, s2) x1 = iterate(s1) x2 = iterate(s2) l = 0