Ignore:
Timestamp:
2016-12-09T09:23:47+01:00 (8 years ago)
Author:
simon04
Message:

JOSM/wikipedia: Resolve Wikipedia redirects when failing Wikidata lookup (patch by nyurik, modified) - fix #josm14001

Location:
applications/editors/josm/plugins/wikipedia
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • applications/editors/josm/plugins/wikipedia/src/org/wikipedia/WikipediaApp.java

    r33076 r33098  
    2121import java.util.function.Function;
    2222import java.util.regex.Pattern;
     23import java.util.stream.Collector;
    2324import java.util.stream.Collectors;
    2425import java.util.stream.Stream;
     
    229230                .values()
    230231                .stream()
    231                 .flatMap(chunk -> getWikidataForArticles0(chunk).entrySet().stream())
     232                .flatMap(chunk -> resolveWikidataItems(chunk).entrySet().stream())
    232233                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    233234    }
    234235
    235     private Map<String, String> getWikidataForArticles0(List<String> articles) {
     236    /**
     237     * Get Wikidata IDs. For any article titles without a Wikidata ID, resolve them
     238     * (normalize and follow redirects), and try getting Wikidata IDs again.
     239     */
     240    private Map<String, String> resolveWikidataItems(Collection<String> articles) {
     241        final Map<String, String> result = getWikidataForArticles0(articles);
     242        final List<String> unresolved = articles.stream()
     243                .filter(title -> !result.containsKey(title))
     244                .collect(Collectors.toList());
     245        if (!unresolved.isEmpty()) {
     246            final Map<String, String> redirects = resolveRedirectsForArticles(unresolved);
     247            final Map<String, String> result2 = getWikidataForArticles0(redirects.values());
     248            redirects.forEach((original, resolved) -> {
     249                if (result2.containsKey(resolved)) {
     250                    result.put(original, result2.get(resolved));
     251                }
     252            });
     253        }
     254        return result;
     255    }
     256
     257    private Map<String, String> getWikidataForArticles0(Collection<String> articles) {
    236258        if (articles.isEmpty()) {
    237259            return Collections.emptyMap();
     
    257279            }
    258280            return r;
     281        } catch (Exception ex) {
     282            throw new RuntimeException(ex);
     283        }
     284    }
     285
     286    /**
     287     * Given a list of Wikipedia titles, returns a map from each title to its normalized title name,
     288     * or, if the title is a redirect page, to the redirect target.
     289     */
     290    private Map<String, String> resolveRedirectsForArticles(Collection<String> articles) {
     291        try {
     292            final String url = getSiteUrl() + "/w/api.php" +
     293                    "?action=query" +
     294                    "&redirects" +
     295                    "&format=xml" +
     296                    "&titles=" + articles.stream().map(Utils::encodeUrl).collect(Collectors.joining("|"));
     297            try (final InputStream in = connect(url).getContent()) {
     298                final Document xml = newDocumentBuilder().parse(in);
     299
     300                // Add both redirects and normalization results to the same map
     301                final Collector<Node, ?, Map<String, String>> fromToCollector = Collectors.toMap(
     302                        node -> X_PATH.evaluateString("./@from", node),
     303                        node -> X_PATH.evaluateString("./@to", node)
     304                );
     305                final Map<String, String> normalized = X_PATH.evaluateNodes("//normalized/n", xml)
     306                        .stream()
     307                        .collect(fromToCollector);
     308                final Map<String, String> redirects = X_PATH.evaluateNodes("//redirects/r", xml)
     309                        .stream()
     310                        .collect(fromToCollector);
     311                // We should only return those keys that were originally requested; intermediate titles (normalized forms that are themselves redirected) must not appear as keys
     312                return articles.stream()
     313                        .collect(Collectors.toMap(Function.identity(), title -> {
     314                                    final String normalizedTitle = normalized.getOrDefault(title, title);
     315                                    return redirects.getOrDefault(normalizedTitle, normalizedTitle);
     316                                }
     317                        ));
     318            }
    259319        } catch (Exception ex) {
    260320            throw new RuntimeException(ex);
  • applications/editors/josm/plugins/wikipedia/test/unit/org/wikipedia/WikipediaAppTest.java

    r33075 r33098  
    119119
    120120    @Test
     121    public void testGetWikidataForArticlesResolveRedirects() throws Exception {
     122        final Map<String, String> map = WikipediaApp.forLanguage("en")
     123                .getWikidataForArticles(Arrays.asList("einstein", "USA"));
     124        assertThat(map.get("einstein"), is("Q937"));
     125        assertThat(map.get("USA"), is("Q30"));
     126        assertThat(map.size(), is(2));
     127    }
     128
     129    @Test
    121130    public void testTicket13991() throws Exception {
    122131        final Map<String, String> map = WikipediaApp.forLanguage("en")
Note: See TracChangeset for help on using the changeset viewer.