1   /* Copyright 2002-2025 CS GROUP
2    * Licensed to CS GROUP (CS) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * CS licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *   http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.orekit.files.ccsds.utils.lexical;
18  
19  import java.io.BufferedInputStream;
20  import java.io.BufferedReader;
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.io.Reader;
24  
25  import org.orekit.data.DataSource;
26  import org.orekit.errors.OrekitException;
27  import org.orekit.errors.OrekitMessages;
28  
29  /** Utility class for selecting either {@link XmlLexicalAnalyzer} or {@link KvnLexicalAnalyzer} depending on
30   * data first bytes.
31   * @author Luc Maisonobe
32   * @since 11.0
33   */
34  public class LexicalAnalyzerSelector {
35  
36      /** Buffer size. */
37      private static final int BUFFER = 4096;
38  
39      /** First bytes in XML document, UCS-4, big-endian, with Byte Order Mark. */
40      private static final byte[] UCS_4_BE_BOM = {
41          0x00, 0x00, -0x02, -0X01, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c
42      };
43  
44      /** First bytes in XML document, UCS-4, little-endian, with Byte Order Mark. */
45      private static final byte[] UCS_4_LE_BOM = {
46          -0x01, -0X02, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00
47      };
48  
49      /** First bytes in XML document, UTF-16, big-endian, with Byte Order Mark. */
50      private static final byte[] UTF_16_BE_BOM = {
51          -0x02, -0X01, 0x00, 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c
52      };
53  
54      /** First bytes in XML document, UTF-16, little-endian, with Byte Order Mark. */
55      private static final byte[] UTF_16_LE_BOM = {
56          -0x01, -0X02, 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c, 0x00
57      };
58  
59      /** First bytes in XML document, UTF-8, endianness irrelevant, with Byte Order Mark. */
60      private static final byte[] UTF_8_BOM = {
61          -0x11, -0x45, -0x41, 0x3c, 0x3f, 0x78, 0x6d, 0x6c
62      };
63  
64      /** First bytes in XML document, UCS-4, big-endian, without Byte Order Mark. */
65      private static final byte[] UCS_4_BE = {
66          0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c
67      };
68  
69      /** First bytes in XML document, UCS-4, little-endian, without Byte Order Mark. */
70      private static final byte[] UCS_4_LE = {
71          0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00
72      };
73  
74      /** First bytes in XML document, UTF-16, big-endian, without Byte Order Mark. */
75      private static final byte[] UTF_16_BE = {
76          0x00, 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c
77      };
78  
79      /** First bytes in XML document, UTF-16, little-endian, without Byte Order Mark. */
80      private static final byte[] UTF_16_LE = {
81          0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c, 0x00
82      };
83  
84      /** First bytes in XML document, UTF-8, endianness irrelevant, without Byte Order Mark. */
85      private static final byte[] UTF_8 = {
86          0x3c, 0x3f, 0x78, 0x6d, 0x6c
87      };
88  
89      /** First characters in XML document, with Byte Order Mark. */
90      private static final String CHARS_BOM = "\ufeff<?xml";
91  
92      /** First characters in XML document, without Byte Order Mark. */
93      private static final String CHARS = "<?xml";
94  
95      /** Private constructor for a utility class.
96       */
97      private LexicalAnalyzerSelector() {
98          // never called
99      }
100 
101     /** Select a {@link LexicalAnalyzer} for a {@link DataSource} based on content.
102      * @param source data source to analyze
103      * @return lexical analyzer suited for the data source format
104      * @throws IOException if first bytes of source cannot be read
105      */
106     public static LexicalAnalyzer select(final DataSource source) throws IOException {
107         final DataSource.Opener opener = source.getOpener();
108         if (opener.rawDataIsBinary()) {
109             return select(source.getName(), opener.openStreamOnce());
110         } else {
111             return select(source.getName(), opener.openReaderOnce());
112         }
113     }
114 
115     /** Select a {@link LexicalAnalyzer} based on content.
116      * @param name message name
117      * @param stream binary stream with message content
118      * @return lexical analyzer suited for the data source format
119      * @throws IOException if first bytes of source cannot be read
120      */
121     private static LexicalAnalyzer select(final String name, final InputStream stream) throws IOException {
122 
123         if (stream == null) {
124             throw new OrekitException(OrekitMessages.UNABLE_TO_FIND_FILE, name);
125         }
126         final BufferedInputStream bis = new BufferedInputStream(stream, BUFFER);
127 
128         // read the first bytes
129         final int size = UCS_4_BE_BOM.length; // UCS-4 with BOM is the longest reference sequence
130         bis.mark(size);
131         final byte[] first = new byte[size];
132         int read = 0;
133         while (read < first.length) {
134             final int n = bis.read(first, read, size - read);
135             if (n < 0) {
136                 // the file is too short for a proper CCSDS message,
137                 // we return arbitrarily a KVN lexical analyzer,
138                 // anyway, it will fail shortly during parsing
139                 bis.reset();
140                 return new KvnLexicalAnalyzer(new DataSource(name, () -> bis));
141             }
142             read += n;
143         }
144 
145         // attempt to recognize an XML prolog, taking care of Byte Order Mark and encoding
146         // we use the tables from section F of Extensible Markup Language (XML) 1.0 (Fifth Edition)
147         // W3C Recommendation 26 November 2008 (https://www.w3.org/TR/2008/REC-xml-20081126/#sec-guessing),
148         // ignoring the unusual octet orders 2143 and 3412
149         if (checkSequence(first, UTF_8)     || checkSequence(first, UTF_8_BOM)     ||
150             checkSequence(first, UTF_16_LE) || checkSequence(first, UTF_16_LE_BOM) ||
151             checkSequence(first, UTF_16_BE) || checkSequence(first, UTF_16_BE_BOM) ||
152             checkSequence(first, UCS_4_LE)  || checkSequence(first, UCS_4_LE_BOM)  ||
153             checkSequence(first, UCS_4_BE)  || checkSequence(first, UCS_4_BE_BOM)) {
154             // we recognized the "<?xml" sequence at start of an XML file
155             bis.reset();
156             return new XmlLexicalAnalyzer(new DataSource(name, () -> bis));
157         } else {
158             // it was not XML, the only other option is KVN
159             bis.reset();
160             return new KvnLexicalAnalyzer(new DataSource(name, () -> bis));
161         }
162 
163     }
164 
165     /** Select a {@link LexicalAnalyzer} based on content.
166      * @param name message name
167      * @param reader character stream with message content
168      * @return lexical analyzer suited for the data source format
169      * @throws IOException if first bytes of source cannot be read
170      */
171     private static LexicalAnalyzer select(final String name, final Reader reader) throws IOException {
172 
173         if (reader == null) {
174             throw new OrekitException(OrekitMessages.UNABLE_TO_FIND_FILE, name);
175         }
176         final BufferedReader br = new BufferedReader(reader, BUFFER);
177 
178         // read the first characters
179         final int size = CHARS_BOM.length();
180         br.mark(size);
181         final char[] first = new char[size];
182         int read = 0;
183         while (read < first.length) {
184             final int n = br.read(first, read, size - read);
185             if (n < 0) {
186                 // the file is too short for a proper CCSDS message,
187                 // we return arbitrarily a KVN lexical analyzer,
188                 // anyway, it will fail shortly during parsing
189                 br.reset();
190                 return new KvnLexicalAnalyzer(new DataSource(name, () -> br));
191             }
192             read += n;
193         }
194         final String firstString = new String(first);
195 
196         // attempt to recognize an XML prolog
197         if (firstString.startsWith(CHARS) || CHARS_BOM.equals(firstString)) {
198             // we recognized the "<?xml" sequence at start of an XML file
199             br.reset();
200             return new XmlLexicalAnalyzer(new DataSource(name, () -> br));
201         } else {
202             // it was not XML, the only other option is KVN
203             br.reset();
204             return new KvnLexicalAnalyzer(new DataSource(name, () -> br));
205         }
206 
207     }
208 
209     /** Check if first bytes match reference sequence.
210      * @param first first bytes read
211      * @param reference reference sequence
212      * @return true if first bytes match reference sequence
213      */
214     private static boolean checkSequence(final byte[] first, final byte[] reference) {
215         for (int i = 0; i < reference.length; ++i) {
216             if (first[i] != reference[i]) {
217                 return false;
218             }
219         }
220         return true;
221     }
222 
223 }