Home | History | Annotate | Download | only in impl
      1 // =================================================================================================
      2 // ADOBE SYSTEMS INCORPORATED
      3 // Copyright 2006 Adobe Systems Incorporated
      4 // All Rights Reserved
      5 //
      6 // NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
      7 // of the Adobe license agreement accompanying it.
      8 // =================================================================================================
      9 
     10 
     11 
     12 package com.adobe.xmp.impl;
     13 
     14 import java.io.UnsupportedEncodingException;
     15 
     16 
     17 /**
     18  * @since   12.10.2006
     19  */
     20 public class Latin1Converter
     21 {
     22 	/** */
     23 	private static final int STATE_START = 0;
     24 	/** */
     25 	private static final int STATE_UTF8CHAR = 11;
     26 
     27 
     28 	/**
     29 	 * Private constructor
     30 	 */
     31 	private Latin1Converter()
     32 	{
     33 		// EMPTY
     34 	}
     35 
     36 
     37 	/**
     38 	 * A converter that processes a byte buffer containing a mix of UTF8 and Latin-1/Cp1252 chars.
     39 	 * The result is a buffer where those chars have been converted to UTF-8;
     40 	 * that means it contains only valid UTF-8 chars.
     41 	 * <p>
     42 	 * <em>Explanation of the processing:</em> First the encoding of the buffer is detected looking
     43 	 * at the first four bytes (that works only if the buffer starts with an ASCII-char,
     44 	 * like xmls &apos;&lt;&apos;). UTF-16/32 flavours do not require further proccessing.
     45 	 * <p>
     46 	 * In the case, UTF-8 is detected, it assumes wrong UTF8 chars to be a sequence of
     47 	 * Latin-1/Cp1252 encoded bytes and converts the chars to their corresponding UTF-8 byte
     48 	 * sequence.
     49 	 * <p>
     50 	 * The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code
     51 	 * page 1252. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined
     52 	 * by Windows 1252. These are in XML's RestrictedChar set, so we map them to a
     53 	 * space.
     54 	 * <p>
     55 	 * The official Latin-1 characters in the range 0xA0..0xFF are converted into
     56 	 * the Unicode Latin Supplement range U+00A0 - U+00FF.
     57 	 * <p>
     58 	 * <em>Example:</em> If an Euro-symbol () appears in the byte buffer (0xE2, 0x82, 0xAC),
     59 	 * it will be left as is. But if only the first two bytes are appearing,
     60 	 * followed by an ASCII char a (0xE2 - 0x82 - 0x41), it will be converted to
     61 	 * 0xC3, 0xA2 () - 0xE2, 0x80, 0x9A () - 0x41 (a).
     62 	 *
     63 	 * @param buffer a byte buffer contain
     64 	 * @return Returns a new buffer containing valid UTF-8
     65 	 */
     66 	public static ByteBuffer convert(ByteBuffer buffer)
     67 	{
     68 		if ("UTF-8".equals(buffer.getEncoding()))
     69 		{
     70 			// the buffer containing one UTF-8 char (up to 8 bytes)
     71 			byte[] readAheadBuffer = new byte[8];
     72 			// the number of bytes read ahead.
     73 			int readAhead  = 0;
     74 			// expected UTF8 bytesto come
     75 			int expectedBytes = 0;
     76 			// output buffer with estimated length
     77 			ByteBuffer out = new ByteBuffer(buffer.length() * 4 / 3);
     78 
     79 			int state = STATE_START;
     80 			for (int i = 0; i < buffer.length(); i++)
     81 			{
     82 				int b = buffer.charAt(i);
     83 
     84 				switch (state)
     85 				{
     86 					default:
     87 					case STATE_START:
     88 						if (b < 0x7F)
     89 						{
     90 							out.append((byte) b);
     91 						}
     92 						else if (b >= 0xC0)
     93 						{
     94 							// start of UTF8 sequence
     95 							expectedBytes = -1;
     96 							int test = b;
     97 							for (; expectedBytes < 8  &&  (test & 0x80) == 0x80; test = test << 1)
     98 							{
     99 								expectedBytes++;
    100 							}
    101 							readAheadBuffer[readAhead++] = (byte) b;
    102 							state = STATE_UTF8CHAR;
    103 						}
    104 						else //  implicitly:  b >= 0x80  &&  b < 0xC0
    105 						{
    106 							// invalid UTF8 start char, assume to be Latin-1
    107 							byte[] utf8 = convertToUTF8((byte) b);
    108 							out.append(utf8);
    109 						}
    110 						break;
    111 
    112 					case STATE_UTF8CHAR:
    113 						if (expectedBytes > 0  &&  (b & 0xC0) == 0x80)
    114 						{
    115 							// valid UTF8 char, add to readAheadBuffer
    116 							readAheadBuffer[readAhead++] = (byte) b;
    117 							expectedBytes--;
    118 
    119 							if (expectedBytes == 0)
    120 							{
    121 								out.append(readAheadBuffer, 0, readAhead);
    122 								readAhead = 0;
    123 
    124 								state = STATE_START;
    125 							}
    126 						}
    127 						else
    128 						{
    129 							// invalid UTF8 char:
    130 							// 1. convert first of seq to UTF8
    131 							byte[] utf8 = convertToUTF8(readAheadBuffer[0]);
    132 							out.append(utf8);
    133 
    134 							// 2. continue processing at second byte of sequence
    135 							i = i - readAhead;
    136 							readAhead = 0;
    137 
    138 							state = STATE_START;
    139 						}
    140 						break;
    141 				}
    142 			}
    143 
    144 			// loop ends with "half" Utf8 char --> assume that the bytes are Latin-1
    145 			if (state == STATE_UTF8CHAR)
    146 			{
    147 				for (int j = 0; j < readAhead; j++)
    148 				{
    149 					byte b = readAheadBuffer[j];
    150 					byte[] utf8 = convertToUTF8(b);
    151 					out.append(utf8);
    152 				}
    153 			}
    154 
    155 			return out;
    156 		}
    157 		else
    158 		{
    159 			// Latin-1 fixing applies only to UTF-8
    160 			return buffer;
    161 		}
    162 	}
    163 
    164 
    165 	/**
    166 	 * Converts a Cp1252 char (contains all Latin-1 chars above 0x80) into a
    167 	 * UTF-8 byte sequence. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are
    168 	 * formally undefined by Windows 1252 and therefore replaced by a space
    169 	 * (0x20).
    170 	 *
    171 	 * @param ch
    172 	 *            an Cp1252 / Latin-1 byte
    173 	 * @return Returns a byte array containing a UTF-8 byte sequence.
    174 	 */
    175 	private static byte[] convertToUTF8(byte ch)
    176 	{
    177 		int c = ch & 0xFF;
    178 		try
    179 		{
    180 			if (c >= 0x80)
    181 			{
    182 				if (c == 0x81  ||  c == 0x8D  ||  c == 0x8F  ||  c == 0x90  ||  c == 0x9D)
    183 				{
    184 					return new byte[] { 0x20 }; // space for undefined
    185 				}
    186 
    187 				// interpret byte as Windows Cp1252 char
    188 				return new String(new byte[] { ch }, "cp1252").getBytes("UTF-8");
    189 			}
    190 		}
    191 		catch (UnsupportedEncodingException e)
    192 		{
    193 			// EMPTY
    194 		}
    195 		return new byte[] { ch };
    196 	}
    197 }
    198