source: osm/applications/editors/josm/plugins/wikipedia/src/org/wikipedia/WikipediaApp.java@32691

Last change on this file since 32691 was 32691, checked in by simon04, 8 years ago

JOSM/wikipedia: switch to Java 8

File size: 24.0 KB
// License: GPL. See LICENSE file for details.
package org.wikipedia;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Scanner;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.openstreetmap.josm.Main;
import org.openstreetmap.josm.data.coor.LatLon;
import org.openstreetmap.josm.data.osm.OsmPrimitive;
import org.openstreetmap.josm.data.osm.Tag;
import org.openstreetmap.josm.tools.AlphanumComparator;
import org.openstreetmap.josm.tools.CheckParameterUtil;
import org.openstreetmap.josm.tools.HttpClient;
import org.openstreetmap.josm.tools.Utils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

public final class WikipediaApp {

    public static final Pattern WIKIDATA_PATTERN = Pattern.compile("Q\\d+");
    private static final DocumentBuilder DOCUMENT_BUILDER = newDocumentBuilder();
    private static final XPath X_PATH = XPathFactory.newInstance().newXPath();

    private WikipediaApp() {
    }

    static String getMediawikiLocale(Locale locale) {
        if (!locale.getCountry().isEmpty()) {
            return locale.getLanguage() + "-" + locale.getCountry().toLowerCase();
        } else {
            return locale.getLanguage();
        }
    }
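
    // Example (added for illustration; not in the original source): getMediawikiLocale
    // turns a Java Locale into a MediaWiki-style language code, lower-casing the country:
    //   getMediawikiLocale(Locale.GERMANY)   // -> "de-de"
    //   getMediawikiLocale(new Locale("en")) // -> "en"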

    static String getSiteUrl(String wikipediaLang) {
        if ("wikidata".equals(wikipediaLang)) {
            return "https://www.wikidata.org";
        } else {
            return "https://" + wikipediaLang + ".wikipedia.org";
        }
    }
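
    // Illustration (not in the original source): "wikidata" acts as a pseudo-language
    // selecting the Wikidata site instead of a Wikipedia:
    //   getSiteUrl("en")       // -> "https://en.wikipedia.org"
    //   getSiteUrl("wikidata") // -> "https://www.wikidata.org"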

    static List<WikipediaEntry> getEntriesFromCoordinates(String wikipediaLang, LatLon min, LatLon max) {
        try {
            // construct url
            final String url = getSiteUrl(wikipediaLang) + "/w/api.php"
                    + "?action=query"
                    + "&list=geosearch"
                    + "&format=xml"
                    + "&gslimit=500"
                    + "&gsbbox=" + max.lat() + "|" + min.lon() + "|" + min.lat() + "|" + max.lon();
            // parse XML document
            final XPathExpression xpathPlacemark = X_PATH.compile("//gs");
            final XPathExpression xpathName = X_PATH.compile("@title");
            final XPathExpression xpathLat = X_PATH.compile("@lat");
            final XPathExpression xpathLon = X_PATH.compile("@lon");
            try (final InputStream in = HttpClient.create(new URL(url)).setReasonForRequest("Wikipedia").connect().getContent()) {
                final Document doc = DOCUMENT_BUILDER.parse(in);
                final NodeList nodes = (NodeList) xpathPlacemark.evaluate(doc, XPathConstants.NODESET);
                final List<WikipediaEntry> entries = new ArrayList<>(nodes.getLength());
                for (int i = 0; i < nodes.getLength(); i++) {
                    final Node node = nodes.item(i);
                    final String name = xpathName.evaluate(node);
                    final LatLon latLon = new LatLon(
                            (double) xpathLat.evaluate(node, XPathConstants.NUMBER),
                            (double) xpathLon.evaluate(node, XPathConstants.NUMBER));
                    if ("wikidata".equals(wikipediaLang)) {
                        entries.add(new WikidataEntry(name, null, latLon, null));
                    } else {
                        entries.add(new WikipediaEntry(wikipediaLang, name, name, latLon));
                    }
                }
                if ("wikidata".equals(wikipediaLang)) {
                    final List<WikidataEntry> withLabel = getLabelForWikidata(entries, Locale.getDefault());
                    return new ArrayList<>(withLabel);
                } else {
                    return entries;
                }
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }
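
    // Usage sketch (added for illustration; the bounding box is a made-up example):
    // the gsbbox parameter is top|left|bottom|right, hence max.lat/min.lon/min.lat/max.lon.
    //   List<WikipediaEntry> around = getEntriesFromCoordinates("en",
    //           new LatLon(48.1, 16.3),  // south-west corner
    //           new LatLon(48.3, 16.5)); // north-east corner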

    static List<WikidataEntry> getWikidataEntriesForQuery(final String languageForQuery, final String query, final Locale localeForLabels) {
        try {
            final String url = "https://www.wikidata.org/w/api.php" +
                    "?action=wbsearchentities" +
                    "&language=" + languageForQuery +
                    "&strictlanguage=false" +
                    "&search=" + Utils.encodeUrl(query) +
                    "&limit=50" +
                    "&format=xml";
            final List<WikidataEntry> r = new ArrayList<>();
            try (final InputStream in = HttpClient.create(new URL(url)).setReasonForRequest("Wikipedia").connect().getContent()) {
                final Document xml = DOCUMENT_BUILDER.parse(in);
                final NodeList nodes = (NodeList) X_PATH.compile("//entity").evaluate(xml, XPathConstants.NODESET);
                final XPathExpression xpathId = X_PATH.compile("@id");
                for (int i = 0; i < nodes.getLength(); i++) {
                    final Node node = nodes.item(i);
                    final String id = (String) xpathId.evaluate(node, XPathConstants.STRING);
                    r.add(new WikidataEntry(id, null, null, null));
                }
            }
            return getLabelForWikidata(r, localeForLabels);
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    static List<WikipediaEntry> getEntriesFromCategory(String wikipediaLang, String category, int depth) {
        try {
            final String url = "https://tools.wmflabs.org/cats-php/"
                    + "?lang=" + wikipediaLang
                    + "&depth=" + depth
                    + "&cat=" + Utils.encodeUrl(category);

            try (final Scanner scanner = new Scanner(
                    HttpClient.create(new URL(url)).setReasonForRequest("Wikipedia").connect().getContentReader())
                    .useDelimiter("\n")) {
                final List<WikipediaEntry> entries = new ArrayList<>();
                while (scanner.hasNext()) {
                    final String article = scanner.next().trim().replace("_", " ");
                    entries.add(new WikipediaEntry(wikipediaLang, article));
                }
                return entries;
            }
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }
    }
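
    // Usage sketch (illustrative; category name is a made-up example): fetches the
    // articles of a category, following subcategories up to the given depth, e.g.
    //   getEntriesFromCategory("de", "Museum in Wien", 1);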

    static List<WikipediaEntry> getEntriesFromClipboard(final String wikipediaLang) {
        return Arrays.stream(Utils.getClipboardContent().split("[\\n\\r]+"))
                .map(x -> new WikipediaEntry(wikipediaLang, x))
                .collect(Collectors.toList());
    }

    static void updateWIWOSMStatus(String wikipediaLang, Collection<WikipediaEntry> entries) {
        Collection<String> articleNames = new ArrayList<>();
        for (WikipediaEntry i : entries) {
            articleNames.add(i.wikipediaArticle);
        }
        Map<String, Boolean> status = new HashMap<>();
        if (!articleNames.isEmpty()) {
            final String url = "https://tools.wmflabs.org/wiwosm/osmjson/getGeoJSON.php?action=check"
                    + "&lang=" + wikipediaLang;

            try {
                final String requestBody = "articles=" + Utils.encodeUrl(articleNames.stream().collect(Collectors.joining(",")));
                try (final Scanner scanner = new Scanner(
                        HttpClient.create(new URL(url), "POST").setReasonForRequest("Wikipedia")
                                .setHeader("Content-Type", "application/x-www-form-urlencoded")
                                .setRequestBody(requestBody.getBytes(StandardCharsets.UTF_8))
                                .connect().getContentReader())
                        .useDelimiter("\n")) {
                    while (scanner.hasNext()) {
                        // [article]\t[0|1]
                        final String line = scanner.next();
                        final String[] x = line.split("\t");
                        if (x.length == 2) {
                            status.put(x[0], "1".equals(x[1]));
                        } else {
                            Main.error("Unknown element " + line);
                        }
                    }
                }
            } catch (Exception ex) {
                throw new RuntimeException(ex);
            }
        }
        for (WikipediaEntry i : entries) {
            i.setWiwosmStatus(status.get(i.wikipediaArticle));
        }
    }
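
    // Protocol sketch (illustrative, inferred from the parsing code above): the
    // WIWOSM check is a POST of the form
    //   articles=Berlin,Hamburg
    // answered with one tab-separated line per article, e.g.
    //   Berlin\t1
    //   Hamburg\t0
    // where "1" apparently means the article already has a geometry on WIWOSM.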

    static Stream<String> getWikipediaArticles(final String wikipediaLang, OsmPrimitive p) {
        if ("wikidata".equals(wikipediaLang)) {
            return Stream.of(p.get("wikidata")).filter(Objects::nonNull);
        }
        final Map<String, String> tags = p.getKeys();
        return Stream
                .of(
                        WikipediaLangArticle.parseTag("wikipedia", tags.get("wikipedia")),
                        WikipediaLangArticle.parseTag("wikipedia:" + wikipediaLang, tags.get("wikipedia:" + wikipediaLang))
                ).filter(Objects::nonNull)
                .filter(wp -> wikipediaLang.equals(wp.lang))
                .map(wp -> wp.article);
    }
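
    // Illustration (not in the original source): for a primitive tagged
    // wikipedia=en:Berlin, getWikipediaArticles("en", p) yields the stream ["Berlin"];
    // a wikipedia:en=... tag is consulted as well, and articles parsed for other
    // languages are filtered out.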

    /**
     * Returns a map from Wikipedia article titles to Wikidata ids.
     */
    static Map<String, String> getWikidataForArticles(String wikipediaLang, List<String> articles) {
        if (articles.size() > 50) {
            final Map<String, String> wikidataItems = new HashMap<>();
            for (final List<String> chunk : partitionList(articles, 50)) {
                wikidataItems.putAll(getWikidataForArticles(wikipediaLang, chunk));
            }
            return wikidataItems;
        }
        try {
            final String url = "https://www.wikidata.org/w/api.php" +
                    "?action=wbgetentities" +
                    "&props=sitelinks" +
                    "&sites=" + wikipediaLang + "wiki" +
                    "&sitefilter=" + wikipediaLang + "wiki" +
                    "&format=xml" +
                    "&titles=" + articles.stream().map(Utils::encodeUrl).collect(Collectors.joining("|"));
            final Map<String, String> r = new TreeMap<>();
            try (final InputStream in = HttpClient.create(new URL(url)).setReasonForRequest("Wikipedia").connect().getContent()) {
                final Document xml = DOCUMENT_BUILDER.parse(in);
                final NodeList nodes = (NodeList) X_PATH.compile("//entity").evaluate(xml, XPathConstants.NODESET);
                for (int i = 0; i < nodes.getLength(); i++) {
                    final Node node = nodes.item(i);
                    final String wikidata = (String) X_PATH.compile("./@id").evaluate(node, XPathConstants.STRING);
                    final String wikipedia = (String) X_PATH.compile("./sitelinks/sitelink/@title").evaluate(node, XPathConstants.STRING);
                    r.put(wikipedia, wikidata);
                }
            }
            return r;
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }
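
    // Usage sketch (illustrative; the returned Q-id depends on live Wikidata data):
    //   Map<String, String> ids = getWikidataForArticles("en", Arrays.asList("Berlin"));
    //   // ids.get("Berlin") would be expected to yield "Q64"
    // Lists longer than 50 titles are split via partitionList to respect the API limit.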

    static List<String> getCategoriesForPrefix(final String wikipediaLang, final String prefix) {
        try {
            final String url = getSiteUrl(wikipediaLang) + "/w/api.php"
                    + "?action=query"
                    + "&list=prefixsearch"
                    + "&format=xml"
                    + "&psnamespace=14"
                    + "&pslimit=50"
                    + "&pssearch=" + Utils.encodeUrl(prefix);
            // parse XML document
            try (final InputStream in = HttpClient.create(new URL(url)).setReasonForRequest("Wikipedia").connect().getContent()) {
                final Document doc = DOCUMENT_BUILDER.parse(in);
                final NodeList nodes = (NodeList) X_PATH.compile("//ps/@title").evaluate(doc, XPathConstants.NODESET);
                final List<String> categories = new ArrayList<>(nodes.getLength());
                for (int i = 0; i < nodes.getLength(); i++) {
                    final Node node = nodes.item(i);
                    final String value = node.getNodeValue();
                    categories.add(value.contains(":") ? value.split(":", 2)[1] : value);
                }
                return categories;
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    static String getLabelForWikidata(String wikidataId, Locale locale, String... preferredLanguage) {
        try {
            return getLabelForWikidata(Collections.singletonList(new WikidataEntry(wikidataId, null, null, null)), locale, preferredLanguage).get(0).label;
        } catch (IndexOutOfBoundsException ignore) {
            return null;
        }
    }

    static List<WikidataEntry> getLabelForWikidata(List<? extends WikipediaEntry> entries, Locale locale, String... preferredLanguage) {
        if (entries.size() > 50) {
            final List<WikidataEntry> entriesWithLabel = new ArrayList<>(entries.size());
            for (final List<? extends WikipediaEntry> chunk : partitionList(entries, 50)) {
                entriesWithLabel.addAll(getLabelForWikidata(chunk, locale, preferredLanguage));
            }
            return entriesWithLabel;
        }
        try {
            final String url = "https://www.wikidata.org/w/api.php" +
                    "?action=wbgetentities" +
                    "&props=labels|descriptions" +
                    "&ids=" + entries.stream().map(x -> x.wikipediaArticle).collect(Collectors.joining("|")) +
                    "&format=xml";
            final Collection<String> languages = new ArrayList<>();
            if (locale != null) {
                languages.add(getMediawikiLocale(locale));
                languages.add(getMediawikiLocale(new Locale(locale.getLanguage())));
            }
            languages.addAll(Arrays.asList(preferredLanguage));
            languages.add("en");
            languages.add(null);
            final List<WikidataEntry> r = new ArrayList<>(entries.size());
            try (final InputStream in = HttpClient.create(new URL(url)).setReasonForRequest("Wikipedia").connect().getContent()) {
                final Document xml = DOCUMENT_BUILDER.parse(in);
                for (final WikipediaEntry entry : entries) {
                    final Node entity = (Node) X_PATH.compile("//entity[@id='" + entry.wikipediaArticle + "']").evaluate(xml, XPathConstants.NODE);
                    r.add(new WikidataEntry(
                            entry.wikipediaArticle,
                            getFirstField(languages, "label", entity),
                            entry.coordinate,
                            getFirstField(languages, "description", entity)
                    ));
                }
            }
            return r;
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }
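
    // Usage sketch (illustrative; the label depends on live Wikidata content):
    //   getLabelForWikidata("Q64", Locale.ENGLISH);  // e.g. "Berlin"
    // The language fallback chain built above is: full locale ("de-de"), bare
    // language ("de"), any preferredLanguage arguments, "en", and finally (via the
    // null entry) any available language.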

    private static String getFirstField(Iterable<String> languages, String field, Node entity) throws XPathExpressionException {
        for (String language : languages) {
            final String label = (String) X_PATH.compile(language != null
                    ? ".//" + field + "[@language='" + language + "']/@value"
                    : ".//" + field + "/@value"
            ).evaluate(entity, XPathConstants.STRING);
            if (label != null && !label.isEmpty()) {
                return label;
            }
        }
        return null;
    }

    static Collection<WikipediaLangArticle> getInterwikiArticles(String wikipediaLang, String article) {
        try {
            Collection<WikipediaLangArticle> r = new ArrayList<>();
            final String url = getSiteUrl(wikipediaLang) + "/w/api.php" +
                    "?action=query" +
                    "&prop=langlinks" +
                    "&titles=" + Utils.encodeUrl(article) +
                    "&lllimit=500" +
                    "&format=xml";
            try (final InputStream in = HttpClient.create(new URL(url)).setReasonForRequest("Wikipedia").connect().getContent()) {
                final Document xml = DOCUMENT_BUILDER.parse(in);
                final NodeList nodes = (NodeList) X_PATH.compile("//ll").evaluate(xml, XPathConstants.NODESET);
                for (int i = 0; i < nodes.getLength(); i++) {
                    final String lang = nodes.item(i).getAttributes().getNamedItem("lang").getTextContent();
                    final String name = nodes.item(i).getTextContent();
                    r.add(new WikipediaLangArticle(lang, name));
                }
            }
            return r;
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    static LatLon getCoordinateForArticle(String wikipediaLang, String article) {
        try {
            final String url = getSiteUrl(wikipediaLang) + "/w/api.php" +
                    "?action=query" +
                    "&prop=coordinates" +
                    "&titles=" + Utils.encodeUrl(article) +
                    "&format=xml";
            try (final InputStream in = HttpClient.create(new URL(url)).setReasonForRequest("Wikipedia").connect().getContent()) {
                final Document xml = DOCUMENT_BUILDER.parse(in);
                final Node node = (Node) X_PATH.compile("//coordinates/co").evaluate(xml, XPathConstants.NODE);
                if (node == null) {
                    return null;
                } else {
                    final double lat = Double.parseDouble(node.getAttributes().getNamedItem("lat").getTextContent());
                    final double lon = Double.parseDouble(node.getAttributes().getNamedItem("lon").getTextContent());
                    return new LatLon(lat, lon);
                }
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    static class WikipediaLangArticle {

        final String lang, article;

        public WikipediaLangArticle(String lang, String article) {
            this.lang = lang;
            this.article = article;
        }

        public static WikipediaLangArticle parseFromUrl(String url) {
            if (url == null) {
                return null;
            }
            // decode URL for nicer value
            url = Utils.decodeUrl(url);
            // extract Wikipedia language and article name
            final Matcher m = Pattern.compile("(https?:)?//(\\w*)\\.wikipedia\\.org/wiki/(.*)").matcher(url);
            if (!m.matches()) {
                return null;
            }
            return new WikipediaLangArticle(m.group(2), m.group(3));
        }

        public static WikipediaLangArticle parseTag(String key, String value) {
            if (value == null) {
                return null;
            } else if (value.startsWith("http")) {
                //wikipedia=http...
                return parseFromUrl(value);
            } else if (value.contains(":")) {
                //wikipedia=[lang]:[article]
                //wikipedia:[lang]=[lang]:[article]
                final String[] item = Utils.decodeUrl(value).split(":", 2);
                final String article = item[1].replace("_", " ");
                return new WikipediaLangArticle(item[0], article);
            } else if (key.startsWith("wikipedia:")) {
                //wikipedia:[lang]=[lang]:[article]
                //wikipedia:[lang]=[article]
                final String lang = key.split(":", 2)[1];
                final String[] item = Utils.decodeUrl(value).split(":", 2);
                final String article = item[item.length == 2 ? 1 : 0].replace("_", " ");
                return new WikipediaLangArticle(lang, article);
            } else {
                return null;
            }
        }
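
        // Illustration (not in the original source) of the accepted tag formats:
        //   parseTag("wikipedia", "en:Berlin")                            // -> en:Berlin
        //   parseTag("wikipedia", "https://en.wikipedia.org/wiki/Berlin") // -> en:Berlin
        //   parseTag("wikipedia:de", "Berlin")                            // -> de:Berlin
        //   parseTag("wikipedia", "en:Statue_of_Liberty")                 // -> en:Statue of Liberty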

        @Override
        public String toString() {
            return lang + ":" + article;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            final WikipediaLangArticle that = (WikipediaLangArticle) o;
            return Objects.equals(lang, that.lang) &&
                    Objects.equals(article, that.article);
        }

        @Override
        public int hashCode() {
            return Objects.hash(lang, article);
        }
    }

    static class WikipediaEntry implements Comparable<WikipediaEntry> {

        final String label;
        final String wikipediaLang, wikipediaArticle;
        final LatLon coordinate;
        private Boolean wiwosmStatus;

        WikipediaEntry(String wikipediaLang, String wikipediaArticle) {
            this(wikipediaLang, wikipediaArticle, null, null);
        }

        WikipediaEntry(String wikipediaLang, String wikipediaArticle, String label, LatLon coordinate) {
            this.label = label;
            this.wikipediaLang = wikipediaLang;
            this.wikipediaArticle = wikipediaArticle;
            this.coordinate = coordinate;
        }

        protected Tag createWikipediaTag() {
            return new Tag("wikipedia", wikipediaLang + ":" + wikipediaArticle);
        }

        public void setWiwosmStatus(Boolean wiwosmStatus) {
            this.wiwosmStatus = wiwosmStatus;
        }

        public Boolean getWiwosmStatus() {
            return wiwosmStatus;
        }

        public String getBrowserUrl() {
            return getSiteUrl(wikipediaLang) + "/wiki/" + Utils.encodeUrl(wikipediaArticle.replace(" ", "_"));
        }

        public String getLabelText() {
            return wikipediaArticle;
        }

        @Override
        public String toString() {
            return wikipediaArticle;
        }

        @Override
        public int compareTo(WikipediaEntry o) {
            final int c = AlphanumComparator.getInstance().compare(label, o.label);
            return c != 0 ? c : AlphanumComparator.getInstance().compare(wikipediaArticle, o.wikipediaArticle);
        }
    }

    static class WikidataEntry extends WikipediaEntry {

        final String description;

        WikidataEntry(String id, String label, LatLon coordinate, String description) {
            super("wikidata", id, label, coordinate);
            this.description = description;
            ensureValidWikidataId(id);
        }

        @Override
        protected Tag createWikipediaTag() {
            return new Tag("wikidata", wikipediaArticle);
        }

        @Override
        public String getLabelText() {
            final String descriptionInParen = description == null ? "" : (" (" + description + ")");
            return getLabelText(label, wikipediaArticle + descriptionInParen);
        }

        static String getLabelText(String bold, String gray) {
            return Utils.escapeReservedCharactersHTML(bold) + " <span color='gray'>" + Utils.escapeReservedCharactersHTML(gray) + "</span>";
        }
    }

    static void ensureValidWikidataId(String id) {
        CheckParameterUtil.ensureThat(WIKIDATA_PATTERN.matcher(id).matches(), "Invalid Wikidata ID given: " + id);
    }

    public static <T> List<List<T>> partitionList(final List<T> list, final int size) {
        return new AbstractList<List<T>>() {
            @Override
            public List<T> get(int index) {
                final int fromIndex = index * size;
                final int toIndex = Math.min(fromIndex + size, list.size());
                return list.subList(fromIndex, toIndex);
            }

            @Override
            public int size() {
                return (int) Math.ceil(((float) list.size()) / size);
            }
        };
    }
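
    // Example (added for illustration): a lazy view of consecutive chunks, e.g.
    //   partitionList(Arrays.asList(1, 2, 3, 4, 5), 2)  // -> [[1, 2], [3, 4], [5]]
    // The sublists are views backed by the original list, not copies.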

    private static DocumentBuilder newDocumentBuilder() {
        try {
            return DocumentBuilderFactory.newInstance().newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            Main.warn("Cannot create DocumentBuilder");
            Main.warn(e);
            throw new RuntimeException(e);
        }
    }
}