Remove unnecessary conditions

2015-11-05 21:03:45 -05:00 · 2015-11-05 21:03:45 -05:00 · 3b25d7b1de
parent 2b41b1fcfa
commit 3b25d7b1de
8 changed files with 88 additions and 85 deletions
--- a/README.md
+++ b/README.md
@ -28,7 +28,6 @@ Q-gram distances compare the set of all substrings of length `q` in each string.
 ## Syntax
 #### evaluate
 The function `evaluate` returns the litteral *distance* between two strings (a value of 0 being identical). While some distances are bounded by 1, other distances like `Hamming`, `Levenshtein`, `Damerau-Levenshtein`,  `Jaccard` can be higher than 1.
-
 ```julia
 using StringDistances
 evaluate(Hamming(), "martha", "marhta")
@ -38,7 +37,7 @@ evaluate(QGram(2), "martha", "marhta")
 ```

 #### compare
-The higher level function `compare` directly computes *a similarity score* between 0 and 1, based on the inverse distance between two strings. A value of 0 being completely different and a value of 1 being completely similar.
+The higher-level function `compare` returns *a similarity score* between two strings, based on the inverse of the distance between them. The similarity score is always between 0 and 1: a value of 0 means the strings are completely different, and a value of 1 means they are completely similar.
 ```julia
 using StringDistances
 compare(Hamming(), "martha", "marhta")
@ -110,7 +109,7 @@ The package defines a number of ways to modify string metrics:

 ## Tips

- Each distance is tailored to a specific problem. Edit distances works well with local spelling errors, the Ratcliff-Obsershelp distance works well with edited texts, the Jaro Winkler distance was invented for short strings such as person names, the QGrams distances works well with strings composed of multiple words with fluctuating orderings.
+- Each distance is tailored to a specific problem. Edit distances work well with local spelling errors, the Ratcliff-Obershelp distance works well with edited texts, the Jaro-Winkler distance was invented for short strings such as person names, and the QGram distances work well with strings composed of multiple words and fluctuating orderings.
 - Most distances perform poorly when comparing company or individual names, where each string is composed of multiple words.

 	- While word ordering is mostly irrelevant in this situation, edit distances heavily penalize different orderings. Instead, use either a distance robust to word order (like QGram distances), or compose a distance with `TokenSort`, which reorders the words alphabetically.
@ -123,7 +122,8 @@ The package defines a number of ways to modify string metrics:
 		compare(Cosine(3), "mariners vs angels", "angels vs mariners")
 		#> 0.8125
 		```
-	- General words (like "bank", "company") may appear in one string but no the other. One solution is to abbreviate these common names first to diminish their importance (ie "bk" "co"). Another solution is to use something like the `Partial` or `TokenSet` modifiers.
+	- General words (like "bank", "company") may appear in one string but not the other. One solution is to abbreviate these common names to diminish their importance (i.e. "bk", "co"). Another solution is to use the `Overlap` distance, which compares common qgrams to the length of the shorter string. A third solution is to use the `Partial` or `TokenSet` modifiers.
+
 - Standardize strings before comparing them (lowercase, punctuation, whitespaces, accents, abbreviations...)


--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -30,15 +30,72 @@ Partial,
 TokenSort,
 TokenSet

-include("distances/evaluate.jl")
 include("distances/edit.jl")
 include("distances/qgram.jl")
 include("distances/RatcliffObershelp.jl")

-include("modifiers/compare.jl")
 include("modifiers/winkler.jl")
 include("modifiers/tokenize.jl")
 include("modifiers/partial.jl")

+##############################################################################
+##
+## Higher level functions
+##
+##############################################################################
+
+# Entry point: measure both strings, put the shorter one first, then forward to the
+# length-aware method evaluate(dist, s1, s2, len1, len2).
+function evaluate(dist::PreMetric, s1::AbstractString, s2::AbstractString)
+    n1 = length(s1)
+    n2 = length(s2)
+    n1 > n2 && return evaluate(dist, s2, s1, n2, n1)
+    return evaluate(dist, s1, s2, n1, n2)
+end
+
+##############################################################################
+##
+## compare
+##
+##############################################################################
+
+# Entry point: measure both strings, put the shorter one first, then forward to the
+# length-aware method compare(dist, s1, s2, len1, len2).
+function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
+    n1 = length(s1)
+    n2 = length(s2)
+    n1 > n2 && return compare(dist, s2, s1, n2, n1)
+    return compare(dist, s1, s2, n1, n2)
+end
+
+# Generic fallback: the similarity is one minus whatever evaluate returns for these arguments.
+function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
+    return 1.0 - evaluate(dist, s1, s2, len1, len2)
+end
+
+# Edit-distance similarity: 1 - distance / len2, or 1.0 when len2 == 0 (no division by zero).
+function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
+    s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
+    d = evaluate(dist, s1, s2, len1, len2)
+    return len2 == 0 ? 1.0 : 1.0 - d / len2
+end
+
+# compare always returns a similarity between 0 and 1 (1 meaning completely similar).
+# When len1 < q (s1 has no q-gram), the result is just whether s1 == s2, as a Float64.
+function compare(dist::AbstractQGram,
+    s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
+    len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
+    # evaluate returns a distance; convert it to a similarity like the other compare methods
+    1.0 - evaluate(dist, s1, s2, len1, len2)
+end
+
+# QGram similarity: the distance is divided by len1 + len2 - 2q + 2, then subtracted from 1.
+# When len1 <= q - 1 the result is just whether s1 == s2, as a Float64.
+function compare(dist::QGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
+    q = dist.q
+    len1 <= (q - 1) && return convert(Float64, s1 == s2)
+    return 1 - evaluate(dist, s1, s2, len1, len2) / (len1 + len2 - 2 * q + 2)
+end
+

 end 
--- a/src/distances/RatcliffObershelp.jl
+++ b/src/distances/RatcliffObershelp.jl
@ -1,24 +1,28 @@
-# Return a character index, not a byte index
+# Return the start of the longest common substring in s1, its start in s2, and its length
+# Positions are character numbers, not byte indexes (the two differ for Unicode strings)
 function longest_common_substring(s1::AbstractString, s2::AbstractString)
-    len2 = length(s2)
-    start1, start2, size = 0, 0, 0
-    p = zeros(Int, len2)
-    i1 = 0
-    for ch1 in s1
-        i1 += 1
-        i2 = 0
-        oldp = 0
-        for ch2 in s2
-            i2 += 1
-            newp = 0
-            if ch1 == ch2
-                newp = oldp > 0 ? oldp : i2
-                currentlength = (i2 - newp + 1)
-                if currentlength > size
-                    start1, start2, size = i1 - currentlength + 1, newp, currentlength
+    if length(s1) > length(s2)
+        start2, start1, size= longest_common_substring(s2, s1)
+    else
+        start1, start2, size = 0, 0, 0
+        p = zeros(Int, length(s2))
+        i1 = 0
+        for ch1 in s1
+            i1 += 1
+            i2 = 0
+            oldp = 0
+            for ch2 in s2
+                i2 += 1
+                newp = 0
+                if ch1 == ch2
+                    newp = oldp > 0 ? oldp : i2
+                    currentlength = (i2 - newp + 1)
+                    if currentlength > size
+                        start1, start2, size = i1 - currentlength + 1, newp, currentlength
+                    end
                end
+                p[i2], oldp = newp, p[i2]
            end
-            p[i2], oldp = newp, p[i2]
        end
    end
    return start1, start2, size
@ -47,7 +51,6 @@ end

 type RatcliffObershelp <: PreMetric end
 function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
-    len2 == 0 && 0.0
    result = matching_blocks(s1, s2)
    matched = 0
    for x in result
--- a/src/distances/edit.jl
+++ b/src/distances/edit.jl
@ -43,7 +43,6 @@ end

 type Levenshtein <: SemiMetric end
 function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
-    len2 == 0 && return 0

    # prefix common to both strings can be ignored
    k, start1, start2 = common_prefix(s1, s2)
@ -92,7 +91,6 @@ end
 type DamerauLevenshtein <: SemiMetric end

 function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
-    len2 == 0 && return 0

    # prefix common to both strings can be ignored
    k, start1, start2 = common_prefix(s1, s2)
@ -161,6 +159,7 @@ end
 type Jaro <: SemiMetric end

 function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) 
+    # If len2 == 0 there are no matches (m = 0), so Wikipedia's definition would give 1.0; 0.0 is returned instead.
    len2 == 0 && return 0.0

    maxdist = max(0, div(len2, 2) - 1)
--- a/src/distances/evaluate.jl
+++ b/src/distances/evaluate.jl
@ -1,8 +0,0 @@
-function evaluate(dist::PreMetric, s1::AbstractString, s2::AbstractString)
-    len1, len2 = length(s1), length(s2)
-    if len1 > len2
-        return evaluate(dist, s2, s1, len2, len1)
-    else
-        return evaluate(dist, s1, s2, len1, len2)
-    end
-end
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -1,10 +1,3 @@
-##############################################################################
-##
-## Define QGram Distance type
-##
-##############################################################################
-abstract AbstractQGram <: SemiMetric
-
 ##############################################################################
 ##
 ## Define a type that iterates through q-grams of a string
@ -85,6 +78,7 @@ end
 ## Distance on strings is computed by set distance on qgram sets
 ##
 ##############################################################################
+abstract AbstractQGram <: SemiMetric

 function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
 	sort1 = sort(QGramIterator(s1, len1, dist.q))
--- a/src/modifiers/compare.jl
+++ b/src/modifiers/compare.jl
@ -1,42 +0,0 @@
-##############################################################################
-##
-## compare
-##
-##############################################################################
-
-function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
-    len1, len2 = length(s1), length(s2)
-    if len1 > len2
-        return compare(dist, s2, s1, len2, len1)
-    else
-        return compare(dist, s1, s2, len1, len2)
-    end
-end
-
-function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString, 
-    len1::Integer, len2::Integer)
-    1.0 - evaluate(dist, s1, s2, len1, len2)
-end
-
-function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein}, 
-    s1::AbstractString, s2::AbstractString,
-    len1::Integer, len2::Integer)
-    distance = evaluate(dist, s1, s2, len1, len2)
-    len2 == 0 ? 1.0 : 1.0 - distance / len2
-end
-
-# while q gram definition are not modified for smaller string (the set is just considered as empty, which leads to NaN values), compare always returns a Float64 value between 0 and 1
-function compare(dist::AbstractQGram, 
-    s1::AbstractString, s2::AbstractString, 
-    len1::Integer, len2::Integer)
-    len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
-    evaluate(dist, s1, s2, len1, len2)
-end
-
-function compare(dist::QGram, 
-    s1::AbstractString, s2::AbstractString, 
-    len1::Integer, len2::Integer)
-    len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
-    distance = evaluate(dist, s1, s2, len1, len2)
-    1 - distance / (len1 + len2 - 2 * dist.q + 2)
-end
--- a/src/modifiers/winkler.jl
+++ b/src/modifiers/winkler.jl
@ -4,7 +4,7 @@
 ##
 ##############################################################################

-type Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
+immutable Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
    dist::T1
    scaling_factor::T2      # scaling factor. Default to 0.1
    boosting_limit::T3      # boost threshold. Default to 0.7