1 /* Copyright 2002-2025 CS GROUP
2 * Licensed to CS GROUP (CS) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * CS licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.orekit.files.ccsds.utils.lexical;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24
25 import org.orekit.data.DataSource;
26 import org.orekit.errors.OrekitException;
27 import org.orekit.errors.OrekitMessages;
28
29 /** Utility class for selecting either {@link XmlLexicalAnalyzer} or {@link KvnLexicalAnalyzer} depending on
30 * data first bytes.
31 * @author Luc Maisonobe
32 * @since 11.0
33 */
34 public class LexicalAnalyzerSelector {
35
36 /** Buffer size. */
37 private static final int BUFFER = 4096;
38
39 /** First bytes in XML document, UCS-4, big-endian, with Byte Order Mark. */
40 private static final byte[] UCS_4_BE_BOM = {
41 0x00, 0x00, -0x02, -0X01, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c
42 };
43
44 /** First bytes in XML document, UCS-4, little-endian, with Byte Order Mark. */
45 private static final byte[] UCS_4_LE_BOM = {
46 -0x01, -0X02, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00
47 };
48
49 /** First bytes in XML document, UTF-16, big-endian, with Byte Order Mark. */
50 private static final byte[] UTF_16_BE_BOM = {
51 -0x02, -0X01, 0x00, 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c
52 };
53
54 /** First bytes in XML document, UTF-16, little-endian, with Byte Order Mark. */
55 private static final byte[] UTF_16_LE_BOM = {
56 -0x01, -0X02, 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c, 0x00
57 };
58
59 /** First bytes in XML document, UTF-8, endianness irrelevant, with Byte Order Mark. */
60 private static final byte[] UTF_8_BOM = {
61 -0x11, -0x45, -0x41, 0x3c, 0x3f, 0x78, 0x6d, 0x6c
62 };
63
64 /** First bytes in XML document, UCS-4, big-endian, without Byte Order Mark. */
65 private static final byte[] UCS_4_BE = {
66 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c
67 };
68
69 /** First bytes in XML document, UCS-4, little-endian, without Byte Order Mark. */
70 private static final byte[] UCS_4_LE = {
71 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00
72 };
73
74 /** First bytes in XML document, UTF-16, big-endian, without Byte Order Mark. */
75 private static final byte[] UTF_16_BE = {
76 0x00, 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c
77 };
78
79 /** First bytes in XML document, UTF-16, little-endian, without Byte Order Mark. */
80 private static final byte[] UTF_16_LE = {
81 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c, 0x00
82 };
83
84 /** First bytes in XML document, UTF-8, endianness irrelevant, without Byte Order Mark. */
85 private static final byte[] UTF_8 = {
86 0x3c, 0x3f, 0x78, 0x6d, 0x6c
87 };
88
89 /** First characters in XML document, with Byte Order Mark. */
90 private static final String CHARS_BOM = "\ufeff<?xml";
91
92 /** First characters in XML document, without Byte Order Mark. */
93 private static final String CHARS = "<?xml";
94
95 /** Private constructor for a utility class.
96 */
97 private LexicalAnalyzerSelector() {
98 // never called
99 }
100
101 /** Select a {@link LexicalAnalyzer} for a {@link DataSource} based on content.
102 * @param source data source to analyze
103 * @return lexical analyzer suited for the data source format
104 * @throws IOException if first bytes of source cannot be read
105 */
106 public static LexicalAnalyzer select(final DataSource source) throws IOException {
107 final DataSource.Opener opener = source.getOpener();
108 if (opener.rawDataIsBinary()) {
109 return select(source.getName(), opener.openStreamOnce());
110 } else {
111 return select(source.getName(), opener.openReaderOnce());
112 }
113 }
114
115 /** Select a {@link LexicalAnalyzer} based on content.
116 * @param name message name
117 * @param stream binary stream with message content
118 * @return lexical analyzer suited for the data source format
119 * @throws IOException if first bytes of source cannot be read
120 */
121 private static LexicalAnalyzer select(final String name, final InputStream stream) throws IOException {
122
123 if (stream == null) {
124 throw new OrekitException(OrekitMessages.UNABLE_TO_FIND_FILE, name);
125 }
126 final BufferedInputStream bis = new BufferedInputStream(stream, BUFFER);
127
128 // read the first bytes
129 final int size = UCS_4_BE_BOM.length; // UCS-4 with BOM is the longest reference sequence
130 bis.mark(size);
131 final byte[] first = new byte[size];
132 int read = 0;
133 while (read < first.length) {
134 final int n = bis.read(first, read, size - read);
135 if (n < 0) {
136 // the file is too short for a proper CCSDS message,
137 // we return arbitrarily a KVN lexical analyzer,
138 // anyway, it will fail shortly during parsing
139 bis.reset();
140 return new KvnLexicalAnalyzer(new DataSource(name, () -> bis));
141 }
142 read += n;
143 }
144
145 // attempt to recognize an XML prolog, taking care of Byte Order Mark and encoding
146 // we use the tables from section F of Extensible Markup Language (XML) 1.0 (Fifth Edition)
147 // W3C Recommendation 26 November 2008 (https://www.w3.org/TR/2008/REC-xml-20081126/#sec-guessing),
148 // ignoring the unusual octet orders 2143 and 3412
149 if (checkSequence(first, UTF_8) || checkSequence(first, UTF_8_BOM) ||
150 checkSequence(first, UTF_16_LE) || checkSequence(first, UTF_16_LE_BOM) ||
151 checkSequence(first, UTF_16_BE) || checkSequence(first, UTF_16_BE_BOM) ||
152 checkSequence(first, UCS_4_LE) || checkSequence(first, UCS_4_LE_BOM) ||
153 checkSequence(first, UCS_4_BE) || checkSequence(first, UCS_4_BE_BOM)) {
154 // we recognized the "<?xml" sequence at start of an XML file
155 bis.reset();
156 return new XmlLexicalAnalyzer(new DataSource(name, () -> bis));
157 } else {
158 // it was not XML, the only other option is KVN
159 bis.reset();
160 return new KvnLexicalAnalyzer(new DataSource(name, () -> bis));
161 }
162
163 }
164
165 /** Select a {@link LexicalAnalyzer} based on content.
166 * @param name message name
167 * @param reader character stream with message content
168 * @return lexical analyzer suited for the data source format
169 * @throws IOException if first bytes of source cannot be read
170 */
171 private static LexicalAnalyzer select(final String name, final Reader reader) throws IOException {
172
173 if (reader == null) {
174 throw new OrekitException(OrekitMessages.UNABLE_TO_FIND_FILE, name);
175 }
176 final BufferedReader br = new BufferedReader(reader, BUFFER);
177
178 // read the first characters
179 final int size = CHARS_BOM.length();
180 br.mark(size);
181 final char[] first = new char[size];
182 int read = 0;
183 while (read < first.length) {
184 final int n = br.read(first, read, size - read);
185 if (n < 0) {
186 // the file is too short for a proper CCSDS message,
187 // we return arbitrarily a KVN lexical analyzer,
188 // anyway, it will fail shortly during parsing
189 br.reset();
190 return new KvnLexicalAnalyzer(new DataSource(name, () -> br));
191 }
192 read += n;
193 }
194 final String firstString = new String(first);
195
196 // attempt to recognize an XML prolog
197 if (firstString.startsWith(CHARS) || CHARS_BOM.equals(firstString)) {
198 // we recognized the "<?xml" sequence at start of an XML file
199 br.reset();
200 return new XmlLexicalAnalyzer(new DataSource(name, () -> br));
201 } else {
202 // it was not XML, the only other option is KVN
203 br.reset();
204 return new KvnLexicalAnalyzer(new DataSource(name, () -> br));
205 }
206
207 }
208
209 /** Check if first bytes match reference sequence.
210 * @param first first bytes read
211 * @param reference reference sequence
212 * @return true if first bytes match reference sequence
213 */
214 private static boolean checkSequence(final byte[] first, final byte[] reference) {
215 for (int i = 0; i < reference.length; ++i) {
216 if (first[i] != reference[i]) {
217 return false;
218 }
219 }
220 return true;
221 }
222
223 }