1 /**
2 * Copyright (c) 2008-2011, http://www.snakeyaml.org
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.yaml.snakeyaml.reader;
18
19 /**
20 version: 1.1 / 2007-01-25
21 - changed BOM recognition ordering (longer boms first)
22
23 Original pseudocode : Thomas Weidenfeller
24 Implementation tweaked: Aki Nieminen
25 Implementation changed: Andrey Somov
26 * UTF-32 removed because it is not supported by YAML
27 * no default encoding
28
29 http://www.unicode.org/unicode/faq/utf_bom.html
30 BOMs:
31 00 00 FE FF = UTF-32, big-endian
32 FF FE 00 00 = UTF-32, little-endian
33 EF BB BF = UTF-8,
34 FE FF = UTF-16, big-endian
35 FF FE = UTF-16, little-endian
36
37 Win2k Notepad:
38 Unicode format = UTF-16LE
39 ***/
40
41 import java.io.IOException;
42 import java.io.InputStream;
43 import java.io.InputStreamReader;
44 import java.io.PushbackInputStream;
45 import java.io.Reader;
46
/**
 * Generic Unicode text reader which uses the byte order mark (BOM) at the
 * start of the stream to identify the encoding. UTF-8, UTF-16BE and UTF-16LE
 * BOMs are recognised and skipped; if no BOM is found the stream is decoded
 * as UTF-8.
 */
public class UnicodeReader extends Reader {
    /** Raw byte stream; its push-back buffer lets non-BOM bytes be returned after sniffing. */
    PushbackInputStream internalIn;
    /** Decoding reader, created lazily by {@link #init()}; {@code null} until then. */
    InputStreamReader internalIn2 = null;

    /** Length in bytes of the longest BOM that is recognised (UTF-8). */
    private static final int BOM_SIZE = 3;

    /**
     * Wraps the given stream. No bytes are consumed until the first read.
     *
     * @param in
     *            InputStream to be read
     */
    public UnicodeReader(InputStream in) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
    }

    /**
     * Get the stream encoding, or {@code null} if the stream is uninitialized.
     * Call init() or read() to initialize it.
     *
     * @return the encoding name as reported by
     *         {@link InputStreamReader#getEncoding()}, or {@code null} when
     *         the encoding has not been detected yet
     */
    public String getEncoding() {
        // The decoder is created lazily; before that there is no encoding to report.
        return internalIn2 == null ? null : internalIn2.getEncoding();
    }

    /**
     * Reads ahead up to {@code BOM_SIZE} bytes and checks them for a BOM.
     * Bytes that are not part of a BOM are pushed back onto the stream, so
     * only the BOM itself is skipped. Does nothing if already initialized.
     *
     * @throws IOException
     *             if reading from or pushing back to the underlying stream fails
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];

        // A single read() may legally return fewer bytes than requested even
        // when more are available later, so keep reading until the buffer is
        // full or the stream makes no further progress (EOF).
        int n = 0;
        while (n < bom.length) {
            int got = internalIn.read(bom, n, bom.length - n);
            if (got <= 0) {
                break;
            }
            n += got;
        }

        String encoding;
        int unread;
        if (n >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
                && (bom[2] == (byte) 0xBF)) {
            encoding = "UTF-8";
            unread = n - 3;
        } else if (n >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
            encoding = "UTF-16BE";
            unread = n - 2;
        } else if (n >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
            encoding = "UTF-16LE";
            unread = n - 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = "UTF-8";
            unread = n;
        }

        if (unread > 0) {
            // Push back only the trailing bytes that followed the BOM.
            internalIn.unread(bom, (n - unread), unread);
        }

        internalIn2 = new InputStreamReader(internalIn, encoding);
    }

    /**
     * Closes the reader and the underlying stream. If nothing has been read
     * yet, the underlying stream is closed directly without attempting BOM
     * detection, so closing never triggers a read.
     *
     * @throws IOException
     *             if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            internalIn.close();
        }
    }

    /**
     * Reads decoded characters into a portion of an array, detecting the
     * encoding on the first call.
     *
     * @param cbuf
     *            destination buffer
     * @param off
     *            offset at which to start storing characters
     * @param len
     *            maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }
}