/*
 * Copyright 2002-2016 Drew Noakes
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 *
 * More information about this project is available at:
 *
 *    https://drewnoakes.com/code/exif/
 *    https://github.com/drewnoakes/metadata-extractor
 */
[8132] | 21 | package com.drew.metadata.iptc;
|
---|
| 22 |
|
---|
| 23 | import com.drew.lang.annotations.NotNull;
|
---|
| 24 | import com.drew.lang.annotations.Nullable;
|
---|
| 25 |
|
---|
| 26 | import java.nio.ByteBuffer;
|
---|
| 27 | import java.nio.charset.CharacterCodingException;
|
---|
| 28 | import java.nio.charset.Charset;
|
---|
| 29 | import java.nio.charset.CharsetDecoder;
|
---|
| 30 |
|
---|
| 31 | public final class Iso2022Converter
|
---|
| 32 | {
|
---|
| 33 | private static final String ISO_8859_1 = "ISO-8859-1";
|
---|
| 34 | private static final String UTF_8 = "UTF-8";
|
---|
| 35 |
|
---|
| 36 | private static final byte LATIN_CAPITAL_A = 0x41;
|
---|
| 37 | private static final int DOT = 0xe280a2;
|
---|
| 38 | private static final byte LATIN_CAPITAL_G = 0x47;
|
---|
| 39 | private static final byte PERCENT_SIGN = 0x25;
|
---|
| 40 | private static final byte ESC = 0x1B;
|
---|
| 41 |
|
---|
| 42 | /**
|
---|
| 43 | * Converts the given ISO2022 char set to a Java charset name.
|
---|
| 44 | *
|
---|
| 45 | * @param bytes string data encoded using ISO2022
|
---|
| 46 | * @return the Java charset name as a string, or <code>null</code> if the conversion was not possible
|
---|
| 47 | */
|
---|
| 48 | @Nullable
|
---|
| 49 | public static String convertISO2022CharsetToJavaCharset(@NotNull final byte[] bytes)
|
---|
| 50 | {
|
---|
| 51 | if (bytes.length > 2 && bytes[0] == ESC && bytes[1] == PERCENT_SIGN && bytes[2] == LATIN_CAPITAL_G)
|
---|
| 52 | return UTF_8;
|
---|
| 53 |
|
---|
| 54 | if (bytes.length > 3 && bytes[0] == ESC && (bytes[3] & 0xFF | ((bytes[2] & 0xFF) << 8) | ((bytes[1] & 0xFF) << 16)) == DOT && bytes[4] == LATIN_CAPITAL_A)
|
---|
| 55 | return ISO_8859_1;
|
---|
| 56 |
|
---|
| 57 | return null;
|
---|
| 58 | }
|
---|
| 59 |
|
---|
| 60 | /**
|
---|
| 61 | * Attempts to guess the encoding of a string provided as a byte array.
|
---|
[10862] | 62 | * <p>
|
---|
[8132] | 63 | * Encodings trialled are, in order:
|
---|
| 64 | * <ul>
|
---|
| 65 | * <li>UTF-8</li>
|
---|
| 66 | * <li><code>System.getProperty("file.encoding")</code></li>
|
---|
| 67 | * <li>ISO-8859-1</li>
|
---|
| 68 | * </ul>
|
---|
[10862] | 69 | * <p>
|
---|
[8132] | 70 | * Its only purpose is to guess the encoding if and only if iptc tag coded character set is not set. If the
|
---|
| 71 | * encoding is not UTF-8, the tag should be set. Otherwise it is bad practice. This method tries to
|
---|
| 72 | * workaround this issue since some metadata manipulating tools do not prevent such bad practice.
|
---|
[10862] | 73 | * <p>
|
---|
[8132] | 74 | * About the reliability of this method: The check if some bytes are UTF-8 or not has a very high reliability.
|
---|
| 75 | * The two other checks are less reliable.
|
---|
| 76 | *
|
---|
| 77 | * @param bytes some text as bytes
|
---|
| 78 | * @return the name of the encoding or null if none could be guessed
|
---|
| 79 | */
|
---|
| 80 | @Nullable
|
---|
| 81 | static String guessEncoding(@NotNull final byte[] bytes)
|
---|
| 82 | {
|
---|
| 83 | String[] encodings = { UTF_8, System.getProperty("file.encoding"), ISO_8859_1 };
|
---|
| 84 |
|
---|
| 85 | for (String encoding : encodings)
|
---|
| 86 | {
|
---|
| 87 | CharsetDecoder cs = Charset.forName(encoding).newDecoder();
|
---|
| 88 |
|
---|
| 89 | try {
|
---|
| 90 | cs.decode(ByteBuffer.wrap(bytes));
|
---|
| 91 | return encoding;
|
---|
| 92 | } catch (CharacterCodingException e) {
|
---|
| 93 | // fall through...
|
---|
| 94 | }
|
---|
| 95 | }
|
---|
| 96 |
|
---|
| 97 | // No encodings succeeded. Return null.
|
---|
| 98 | return null;
|
---|
| 99 | }
|
---|
| 100 |
|
---|
| 101 | private Iso2022Converter()
|
---|
| 102 | {}
|
---|
| 103 | }
|
---|