Changeset 7848 in josm for trunk/src


Ignore:
Timestamp:
2014-12-20T02:55:42+01:00 (10 years ago)
Author:
Don-vip
Message:

fix #3733 - SimilarNamedWays naïvely uses Levenshtein distance and marks a lot of false positives (patch by mdk, brianegge, modified). Rules still need to be stored in JOSM preferences instead of current hardcoding.

Location:
trunk/src/org/openstreetmap/josm/data/validation
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/org/openstreetmap/josm/data/validation/TestError.java

    r7005 r7848  
    327327    @Override
    328328    public String toString() {
    329         return "TestError [tester=" + tester + ", code=" + code + "]";
     329        return "TestError [tester=" + tester + ", code=" + code + ", message=" + message + "]";
    330330    }
    331331}
  • trunk/src/org/openstreetmap/josm/data/validation/tests/SimilarNamedWays.java

    r7005 r7848  
    22package org.openstreetmap.josm.data.validation.tests;
    33
     4import static java.util.regex.Pattern.CASE_INSENSITIVE;
     5import static java.util.regex.Pattern.UNICODE_CASE;
    46import static org.openstreetmap.josm.tools.I18n.tr;
    57
    68import java.awt.geom.Point2D;
    79import java.util.ArrayList;
     10import java.util.Arrays;
    811import java.util.HashMap;
    912import java.util.List;
    1013import java.util.Map;
     14import java.util.regex.Matcher;
     15import java.util.regex.Pattern;
    1116
    1217import org.openstreetmap.josm.data.osm.OsmPrimitive;
     
    3540    private MultiMap<Way, Way> errorWays;
    3641
     42    private ArrayList<NormalizeRule> rules = new ArrayList<NormalizeRule>();
     43
    3744    /**
    3845     * Constructor
     
    4148        super(tr("Similarly named ways"),
    4249                tr("This test checks for ways with similar names that may have been misspelled."));
     50
     51        // FIXME: hardcode these rules for now. Replace them with preferences later
     52        // See https://josm.openstreetmap.de/ticket/3733#comment:19
     53        addRegExprRule("\\d+", "0"); // Highway 66
     54        addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave
     55        addRegExprRule("^[A-Z] ", "X"); // E Street
     56        addSynonyms("east", "west", "north", "south");
     57        addSynonyms("first", "second", "third");
    4358    }
    4459
     
    7893                }
    7994
    80                 int levenshteinDistance = getLevenshteinDistance(name, name2);
    81                 if (0 < levenshteinDistance && levenshteinDistance <= 2) {
     95                if (similaryName(name, name2)) {
    8296                    List<OsmPrimitive> primitives = new ArrayList<>(2);
    8397                    primitives.add(w);
     
    98112     * @return The distance between words
    99113     */
    100     public int getLevenshteinDistance(String s, String t) {
     114    public static int getLevenshteinDistance(String s, String t) {
    101115        int[][] d; // matrix
    102116        int n; // length of s
     
    150164        return d[n][m];
    151165    }
     166
     167    /**
     168     * Add a regular expression rule.
     169     * @param regExpr the regular expression to search for
     170     * @param replacement a string to replace with, which should match the expression.
     171     */
     172    public void addRegExprRule(String regExpr, String replacement) {
     173        rules.add(new RegExprRule(regExpr, replacement));
     174    }
     175
     176    /**
     177     * Add a rule with synonym words.
     178     * @param words words which are synonyms
     179     */
     180    public void addSynonyms(String... words) {
     181        for (String word : words) {
     182            rules.add(new SynonymRule(word, words));
     183        }
     184    }
     185
     186    /**
     187     * Check if two names are similar, but not identical. First both names will be "normalized".
     188     * Afterwards the Levenshtein distance will be calculated.<br>
     189     * Examples for normalization rules:<br>
     190     * <code>replaceAll("\\d+", "0")</code><br>
     191     * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true
     192     * @param name first name to compare
     193     * @param name2 second name to compare
     194     * @return true if the normalized names are different but only a "little bit"
     195     */
     196    public boolean similaryName(String name, String name2) {
     197        // check plain strings
     198        int distance = getLevenshteinDistance(name, name2);
     199        boolean similar = distance>0 && distance<=2;
     200
     201        // try all rules
     202        for (NormalizeRule rule : rules) {
     203            int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2));
     204            if (levenshteinDistance == 0)
     205                // one rule results in identical names: identical
     206                return false;
     207            else if (levenshteinDistance <= 2) {
     208                // 0 < distance <= 2
     209                similar = true;
     210            }
     211        }
     212        return similar;
     213    }
     214
     215    public interface NormalizeRule {
     216
     217        /**
     218         * Normalize the string by replacing parts.
     219         * @param name name to normalize
     220         * @return normalized string
     221         */
     222        String normalize(String name);
     223    }
     224
     225    public class RegExprRule implements NormalizeRule {
     226        private final Pattern regExpr;
     227        private final String replacement;
     228
     229        public RegExprRule(String expression, String replacement) {
     230            this.regExpr = Pattern.compile(expression);
     231            this.replacement = replacement;
     232        }
     233
     234        @Override
     235        public String normalize(String name) {
     236            return regExpr.matcher(name).replaceAll(replacement);
     237        }
     238
     239        @Override
     240        public String toString() {
     241            return "replaceAll(" + regExpr + ", " + replacement + ")";
     242        }
     243    }
     244
     245    public class SynonymRule implements NormalizeRule {
     246
     247        private final String[] words;
     248        private final Pattern regExpr;
     249        private final String replacement;
     250
     251        public SynonymRule(String replacement, String[] words) {
     252            this.replacement = replacement.toLowerCase();
     253            this.words = words;
     254
     255            // build regular expression for other words (for fast match)
     256            StringBuilder expression = new StringBuilder();
     257            int maxLength = 0;
     258            for (int i = 0; i < words.length; i++) {
     259                if (words[i].length() > maxLength) {
     260                    maxLength = words[i].length();
     261                }
     262                if (expression.length() > 0) {
     263                    expression.append("|");
     264                }
     265                expression.append(Pattern.quote(words[i]));
     266            }
     267            this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE);
     268        }
     269
     270        @Override
     271        public String normalize(String name) {
     272            // find first match
     273            Matcher matcher = regExpr.matcher(name);
     274            if (!matcher.find())
     275                return name;
     276
     277            int start = matcher.start();
     278
     279            // which word matches?
     280            String part = "";
     281            for (int i = 0; i < words.length; i++) {
     282                String word = words[i];
     283                part = name.substring(start, start + word.length());
     284                if (word.equalsIgnoreCase(part)) {
     285                    break;
     286                }
     287            }
     288
     289            // replace the word
     290            char[] newName = matcher.replaceFirst(replacement).toCharArray();
     291
     292            // adjust case (replacement is not shorter than matching word!)
     293            int minLength = Math.min(replacement.length(), part.length());
     294            for (int i = 0; i < minLength; i++) {
     295                if (Character.isUpperCase(part.charAt(i))) {
     296                    newName[start + i] = Character.toUpperCase(newName[start + i]);
     297                }
     298            }
     299
     300            return new String(newName);
     301        }
     302
     303        @Override
     304        public String toString() {
     305            return "synonyms(" + replacement + ", " + Arrays.toString(words) + ")";
     306        }
     307    }
    152308}
Note: See TracChangeset for help on using the changeset viewer.