1 | package com.drew.metadata.iptc;
|
---|
2 |
|
---|
3 | import com.drew.lang.annotations.NotNull;
|
---|
4 | import com.drew.lang.annotations.Nullable;
|
---|
5 |
|
---|
6 | import java.nio.ByteBuffer;
|
---|
7 | import java.nio.charset.CharacterCodingException;
|
---|
8 | import java.nio.charset.Charset;
|
---|
9 | import java.nio.charset.CharsetDecoder;
|
---|
10 |
|
---|
11 | public final class Iso2022Converter
|
---|
12 | {
|
---|
13 | private static final String ISO_8859_1 = "ISO-8859-1";
|
---|
14 | private static final String UTF_8 = "UTF-8";
|
---|
15 |
|
---|
16 | private static final byte LATIN_CAPITAL_A = 0x41;
|
---|
17 | private static final int DOT = 0xe280a2;
|
---|
18 | private static final byte LATIN_CAPITAL_G = 0x47;
|
---|
19 | private static final byte PERCENT_SIGN = 0x25;
|
---|
20 | private static final byte ESC = 0x1B;
|
---|
21 |
|
---|
22 | /**
|
---|
23 | * Converts the given ISO2022 char set to a Java charset name.
|
---|
24 | *
|
---|
25 | * @param bytes string data encoded using ISO2022
|
---|
26 | * @return the Java charset name as a string, or <code>null</code> if the conversion was not possible
|
---|
27 | */
|
---|
28 | @Nullable
|
---|
29 | public static String convertISO2022CharsetToJavaCharset(@NotNull final byte[] bytes)
|
---|
30 | {
|
---|
31 | if (bytes.length > 2 && bytes[0] == ESC && bytes[1] == PERCENT_SIGN && bytes[2] == LATIN_CAPITAL_G)
|
---|
32 | return UTF_8;
|
---|
33 |
|
---|
34 | if (bytes.length > 3 && bytes[0] == ESC && (bytes[3] & 0xFF | ((bytes[2] & 0xFF) << 8) | ((bytes[1] & 0xFF) << 16)) == DOT && bytes[4] == LATIN_CAPITAL_A)
|
---|
35 | return ISO_8859_1;
|
---|
36 |
|
---|
37 | return null;
|
---|
38 | }
|
---|
39 |
|
---|
40 | /**
|
---|
41 | * Attempts to guess the encoding of a string provided as a byte array.
|
---|
42 | * <p/>
|
---|
43 | * Encodings trialled are, in order:
|
---|
44 | * <ul>
|
---|
45 | * <li>UTF-8</li>
|
---|
46 | * <li><code>System.getProperty("file.encoding")</code></li>
|
---|
47 | * <li>ISO-8859-1</li>
|
---|
48 | * </ul>
|
---|
49 | * <p/>
|
---|
50 | * Its only purpose is to guess the encoding if and only if iptc tag coded character set is not set. If the
|
---|
51 | * encoding is not UTF-8, the tag should be set. Otherwise it is bad practice. This method tries to
|
---|
52 | * workaround this issue since some metadata manipulating tools do not prevent such bad practice.
|
---|
53 | * <p/>
|
---|
54 | * About the reliability of this method: The check if some bytes are UTF-8 or not has a very high reliability.
|
---|
55 | * The two other checks are less reliable.
|
---|
56 | *
|
---|
57 | * @param bytes some text as bytes
|
---|
58 | * @return the name of the encoding or null if none could be guessed
|
---|
59 | */
|
---|
60 | @Nullable
|
---|
61 | static String guessEncoding(@NotNull final byte[] bytes)
|
---|
62 | {
|
---|
63 | String[] encodings = { UTF_8, System.getProperty("file.encoding"), ISO_8859_1 };
|
---|
64 |
|
---|
65 | for (String encoding : encodings)
|
---|
66 | {
|
---|
67 | CharsetDecoder cs = Charset.forName(encoding).newDecoder();
|
---|
68 |
|
---|
69 | try {
|
---|
70 | cs.decode(ByteBuffer.wrap(bytes));
|
---|
71 | return encoding;
|
---|
72 | } catch (CharacterCodingException e) {
|
---|
73 | // fall through...
|
---|
74 | }
|
---|
75 | }
|
---|
76 |
|
---|
77 | // No encodings succeeded. Return null.
|
---|
78 | return null;
|
---|
79 | }
|
---|
80 |
|
---|
81 | private Iso2022Converter()
|
---|
82 | {}
|
---|
83 | }
|
---|