unnecessary conditions

pull/3/head
matthieugomez 2015-11-05 21:03:45 -05:00
parent 2b41b1fcfa
commit 3b25d7b1de
8 changed files with 88 additions and 85 deletions

View File

@ -28,7 +28,6 @@ Q-gram distances compare the set of all substrings of length `q` in each string.
## Syntax
#### evaluate
The function `evaluate` returns the literal *distance* between two strings (a value of 0 meaning the strings are identical). While some distances are bounded by 1, other distances like `Hamming`, `Levenshtein`, `Damerau-Levenshtein`, `Jaccard` can be higher than 1.
```julia
using StringDistances
evaluate(Hamming(), "martha", "marhta")
@ -38,7 +37,7 @@ evaluate(QGram(2), "martha", "marhta")
```
#### compare
The higher level function `compare` directly computes *a similarity score* between 0 and 1, based on the inverse distance between two strings. A value of 0 means the strings are completely different, and a value of 1 means they are completely similar.
The higher level function `compare` returns *a similarity score* between two strings, based on the inverse of the distance between them. The similarity score is always between 0 and 1: a value of 0 means the strings are completely different, and a value of 1 means they are completely similar.
```julia
using StringDistances
compare(Hamming(), "martha", "marhta")
@ -110,7 +109,7 @@ The package defines a number of ways to modify string metrics:
## Tips
- Each distance is tailored to a specific problem. Edit distances work well with local spelling errors, the Ratcliff-Obershelp distance works well with edited texts, the Jaro-Winkler distance was invented for short strings such as person names, and the QGram distances work well with strings composed of multiple words with fluctuating orderings.
- Each distance is tailored to a specific problem. Edit distances work well with local spelling errors, the Ratcliff-Obershelp distance works well with edited texts, the Jaro-Winkler distance was invented for short strings such as person names, and the QGram distances work well with strings composed of multiple words and fluctuating orderings.
- Most distances perform poorly when comparing company or individual names, where each string is composed of multiple words.
- While word ordering is mostly irrelevant in this situation, edit distances heavily penalize different orderings. Instead, use either a distance robust to word order (like QGram distances), or compose a distance with `TokenSort`, which reorders the words alphabetically.
@ -123,7 +122,8 @@ The package defines a number of ways to modify string metrics:
compare(Cosine(3), "mariners vs angels", "angels vs mariners")
#> 0.8125
```
- General words (like "bank", "company") may appear in one string but not the other. One solution is to abbreviate these common names first to diminish their importance (i.e. "bk", "co"). Another solution is to use something like the `Partial` or `TokenSet` modifiers.
- General words (like "bank", "company") may appear in one string but not the other. One solution is to abbreviate these common names to diminish their importance (i.e. "bk", "co"). Another solution is to use the `Overlap` distance, which compares common qgrams to the length of the shorter string. Another solution is to use the `Partial` or `TokenSet` modifiers.
- Standardize strings before comparing them (lowercase, punctuation, whitespaces, accents, abbreviations...)

View File

@ -30,15 +30,72 @@ Partial,
TokenSort,
TokenSet
include("distances/evaluate.jl")
include("distances/edit.jl")
include("distances/qgram.jl")
include("distances/RatcliffObershelp.jl")
include("modifiers/compare.jl")
include("modifiers/winkler.jl")
include("modifiers/tokenize.jl")
include("modifiers/partial.jl")
##############################################################################
##
## Higher level functions
##
##############################################################################
# Distance between two strings under `dist`.
#
# Measures both strings with `length` and forwards to the five-argument
# `evaluate` method, always passing the string with fewer characters first
# (ties keep the original argument order).
function evaluate(dist::PreMetric, s1::AbstractString, s2::AbstractString)
    n1 = length(s1)
    n2 = length(s2)
    # s1 is strictly longer: swap the pair so the shorter string leads.
    n1 > n2 && return evaluate(dist, s2, s1, n2, n1)
    return evaluate(dist, s1, s2, n1, n2)
end
##############################################################################
##
## compare
##
##############################################################################
# Similarity between two strings under `dist`.
#
# Measures both strings with `length` and forwards to the five-argument
# `compare` method, always passing the string with fewer characters first
# (ties keep the original argument order).
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
    n1 = length(s1)
    n2 = length(s2)
    # s1 is strictly longer: swap the pair so the shorter string leads.
    n1 > n2 && return compare(dist, s2, s1, n2, n1)
    return compare(dist, s1, s2, n1, n2)
end
# Generic fallback: turn the distance reported by the five-argument `evaluate`
# into a similarity by subtracting it from 1.0.
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    d = evaluate(dist, s1, s2, len1, len2)
    return 1.0 - d
end
# Similarity for Hamming, Levenshtein and Damerau-Levenshtein.
#
# The distance is divided by `len2` before being subtracted from 1.0. The
# three-argument `compare` passes the longer string's length as `len2`.
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    d = evaluate(dist, s1, s2, len1, len2)
    # Avoid dividing by zero when `len2` is 0: report full similarity.
    if len2 == 0
        return 1.0
    end
    return 1.0 - d / len2
end
# compare always returns a value between 0 and 1.
# When the string length is < q for a qgram distance, returns s1 == s2 (as a Float64).
#
# `len1` and `len2` are the character counts of `s1` and `s2`; the three-argument
# `compare` passes the shorter string first, so `len1` is the smaller length there.
function compare(dist::AbstractQGram,
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    # A string shorter than q has no substring of length q, so fall back to
    # plain equality: 1.0 if the strings are identical, 0.0 otherwise.
    len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
    # `evaluate` yields a distance (0 means identical). Flip it so that 1 means
    # identical, consistent with the fallback above and with the generic
    # `compare(dist::PreMetric, ...)` method.
    return 1.0 - evaluate(dist, s1, s2, len1, len2)
end
# Similarity for the `QGram` distance.
#
# The distance is divided by len1 + len2 - 2q + 2, which equals
# (len1 - q + 1) + (len2 - q + 1), before being subtracted from 1.
function compare(dist::QGram,
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    # Fewer than q characters in s1: fall back to plain string equality.
    if len1 <= (dist.q - 1)
        return convert(Float64, s1 == s2)
    end
    d = evaluate(dist, s1, s2, len1, len2)
    denominator = len1 + len2 - 2 * dist.q + 2
    return 1 - d / denominator
end
end

View File

@ -1,24 +1,28 @@
# Return a character index, not a byte index
# Return start of common substring in s1, start of common substring in s2, and length of substring
# Indexes refer to character number, not index (differ for Unicode strings)
function longest_common_substring(s1::AbstractString, s2::AbstractString)
len2 = length(s2)
start1, start2, size = 0, 0, 0
p = zeros(Int, len2)
i1 = 0
for ch1 in s1
i1 += 1
i2 = 0
oldp = 0
for ch2 in s2
i2 += 1
newp = 0
if ch1 == ch2
newp = oldp > 0 ? oldp : i2
currentlength = (i2 - newp + 1)
if currentlength > size
start1, start2, size = i1 - currentlength + 1, newp, currentlength
if length(s1) > length(s2)
start2, start1, size= longest_common_substring(s2, s1)
else
start1, start2, size = 0, 0, 0
p = zeros(Int, length(s2))
i1 = 0
for ch1 in s1
i1 += 1
i2 = 0
oldp = 0
for ch2 in s2
i2 += 1
newp = 0
if ch1 == ch2
newp = oldp > 0 ? oldp : i2
currentlength = (i2 - newp + 1)
if currentlength > size
start1, start2, size = i1 - currentlength + 1, newp, currentlength
end
end
p[i2], oldp = newp, p[i2]
end
p[i2], oldp = newp, p[i2]
end
end
return start1, start2, size
@ -47,7 +51,6 @@ end
type RatcliffObershelp <: PreMetric end
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
len2 == 0 && 0.0
result = matching_blocks(s1, s2)
matched = 0
for x in result

View File

@ -43,7 +43,6 @@ end
type Levenshtein <: SemiMetric end
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
len2 == 0 && return 0
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
@ -92,7 +91,6 @@ end
type DamerauLevenshtein <: SemiMetric end
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
len2 == 0 && return 0
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
@ -161,6 +159,7 @@ end
type Jaro <: SemiMetric end
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
# if len2 == 0, m = 0 so should be 1.0 according to wikipedia. Nope.
len2 == 0 && return 0.0
maxdist = max(0, div(len2, 2) - 1)

View File

@ -1,8 +0,0 @@
# Distance between two strings under `dist`.
#
# Measures both strings with `length` and forwards to the five-argument
# `evaluate` method, always passing the string with fewer characters first
# (ties keep the original argument order).
function evaluate(dist::PreMetric, s1::AbstractString, s2::AbstractString)
    n1 = length(s1)
    n2 = length(s2)
    # s1 is strictly longer: swap the pair so the shorter string leads.
    n1 > n2 && return evaluate(dist, s2, s1, n2, n1)
    return evaluate(dist, s1, s2, n1, n2)
end

View File

@ -1,10 +1,3 @@
##############################################################################
##
## Define QGram Distance type
##
##############################################################################
abstract AbstractQGram <: SemiMetric
##############################################################################
##
## Define a type that iterates through q-grams of a string
@ -85,6 +78,7 @@ end
## Distance on strings is computed by set distance on qgram sets
##
##############################################################################
abstract AbstractQGram <: SemiMetric
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
sort1 = sort(QGramIterator(s1, len1, dist.q))

View File

@ -1,42 +0,0 @@
##############################################################################
##
## compare
##
##############################################################################
# Similarity between two strings under `dist`.
#
# Measures both strings with `length` and forwards to the five-argument
# `compare` method, always passing the string with fewer characters first
# (ties keep the original argument order).
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
    n1 = length(s1)
    n2 = length(s2)
    # s1 is strictly longer: swap the pair so the shorter string leads.
    n1 > n2 && return compare(dist, s2, s1, n2, n1)
    return compare(dist, s1, s2, n1, n2)
end
# Generic fallback: turn the distance reported by the five-argument `evaluate`
# into a similarity by subtracting it from 1.0.
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    d = evaluate(dist, s1, s2, len1, len2)
    return 1.0 - d
end
# Similarity for Hamming, Levenshtein and Damerau-Levenshtein.
#
# The distance is divided by `len2` before being subtracted from 1.0. The
# three-argument `compare` passes the longer string's length as `len2`.
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    d = evaluate(dist, s1, s2, len1, len2)
    # Avoid dividing by zero when `len2` is 0: report full similarity.
    if len2 == 0
        return 1.0
    end
    return 1.0 - d / len2
end
# The q-gram distance definitions are not modified for strings shorter than q
# (the q-gram set is just considered empty, which leads to NaN values), but
# compare always returns a Float64 value between 0 and 1.
#
# `len1` and `len2` are the character counts of `s1` and `s2`; the three-argument
# `compare` passes the shorter string first, so `len1` is the smaller length there.
function compare(dist::AbstractQGram,
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    # A string shorter than q has no substring of length q, so fall back to
    # plain equality: 1.0 if the strings are identical, 0.0 otherwise.
    len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
    # `evaluate` yields a distance (0 means identical). Flip it so that 1 means
    # identical, consistent with the fallback above and with the generic
    # `compare(dist::PreMetric, ...)` method.
    return 1.0 - evaluate(dist, s1, s2, len1, len2)
end
# Similarity for the `QGram` distance.
#
# The distance is divided by len1 + len2 - 2q + 2, which equals
# (len1 - q + 1) + (len2 - q + 1), before being subtracted from 1.
function compare(dist::QGram,
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    # Fewer than q characters in s1: fall back to plain string equality.
    if len1 <= (dist.q - 1)
        return convert(Float64, s1 == s2)
    end
    d = evaluate(dist, s1, s2, len1, len2)
    denominator = len1 + len2 - 2 * dist.q + 2
    return 1 - d / denominator
end

View File

@ -4,7 +4,7 @@
##
##############################################################################
type Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
immutable Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
dist::T1
scaling_factor::T2 # scaling factor. Default to 0.1
boosting_limit::T3 # boost threshold. Default to 0.7