LexicalAnalyzerSelector

/* Copyright 2002-2025 CS GROUP
 * Licensed to CS GROUP (CS) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * CS licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.orekit.files.ccsds.utils.lexical;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

import org.orekit.data.DataSource;
import org.orekit.errors.OrekitException;
import org.orekit.errors.OrekitMessages;

/** Utility class for selecting either {@link XmlLexicalAnalyzer} or {@link KvnLexicalAnalyzer} depending on
 * data first bytes.
 * @author Luc Maisonobe
 * @since 11.0
 */
public class LexicalAnalyzerSelector {

    /** Buffer size. */
    private static final int BUFFER = 4096;

    /** First bytes in XML document, UCS-4, big-endian, with Byte Order Mark. */
    private static final byte[] UCS_4_BE_BOM = {
        0x00, 0x00, -0x02, -0X01, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c
    };

    /** First bytes in XML document, UCS-4, little-endian, with Byte Order Mark. */
    private static final byte[] UCS_4_LE_BOM = {
        -0x01, -0X02, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00
    };

    /** First bytes in XML document, UTF-16, big-endian, with Byte Order Mark. */
    private static final byte[] UTF_16_BE_BOM = {
        -0x02, -0X01, 0x00, 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c
    };

    /** First bytes in XML document, UTF-16, little-endian, with Byte Order Mark. */
    private static final byte[] UTF_16_LE_BOM = {
        -0x01, -0X02, 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c, 0x00
    };

    /** First bytes in XML document, UTF-8, endianness irrelevant, with Byte Order Mark. */
    private static final byte[] UTF_8_BOM = {
        -0x11, -0x45, -0x41, 0x3c, 0x3f, 0x78, 0x6d, 0x6c
    };

    /** First bytes in XML document, UCS-4, big-endian, without Byte Order Mark. */
    private static final byte[] UCS_4_BE = {
        0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c
    };

    /** First bytes in XML document, UCS-4, little-endian, without Byte Order Mark. */
    private static final byte[] UCS_4_LE = {
        0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00
    };

    /** First bytes in XML document, UTF-16, big-endian, without Byte Order Mark. */
    private static final byte[] UTF_16_BE = {
        0x00, 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c
    };

    /** First bytes in XML document, UTF-16, little-endian, without Byte Order Mark. */
    private static final byte[] UTF_16_LE = {
        0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c, 0x00
    };

    /** First bytes in XML document, UTF-8, endianness irrelevant, without Byte Order Mark. */
    private static final byte[] UTF_8 = {
        0x3c, 0x3f, 0x78, 0x6d, 0x6c
    };

    /** First characters in XML document, with Byte Order Mark. */
    private static final String CHARS_BOM = "\ufeff<?xml";

    /** First characters in XML document, without Byte Order Mark. */
    private static final String CHARS = "<?xml";

    /** Private constructor for a utility class.
     */
    private LexicalAnalyzerSelector() {
        // never called
    }

    /** Select a {@link LexicalAnalyzer} for a {@link DataSource} based on content.
     * @param source data source to analyze
     * @return lexical analyzer suited for the data source format
     * @throws IOException if first bytes of source cannot be read
     */
    public static LexicalAnalyzer select(final DataSource source) throws IOException {
        final DataSource.Opener opener = source.getOpener();
        if (opener.rawDataIsBinary()) {
            return select(source.getName(), opener.openStreamOnce());
        } else {
            return select(source.getName(), opener.openReaderOnce());
        }
    }

    /** Select a {@link LexicalAnalyzer} based on content.
     * @param name message name
     * @param stream binary stream with message content
     * @return lexical analyzer suited for the data source format
     * @throws IOException if first bytes of source cannot be read
     */
    private static LexicalAnalyzer select(final String name, final InputStream stream) throws IOException {

        if (stream == null) {
            throw new OrekitException(OrekitMessages.UNABLE_TO_FIND_FILE, name);
        }
        final BufferedInputStream bis = new BufferedInputStream(stream, BUFFER);

        // read the first bytes
        final int size = UCS_4_BE_BOM.length; // UCS-4 with BOM is the longest reference sequence
        bis.mark(size);
        final byte[] first = new byte[size];
        int read = 0;
        while (read < first.length) {
            final int n = bis.read(first, read, size - read);
            if (n < 0) {
                // the file is too short for a proper CCSDS message,
                // we return arbitrarily a KVN lexical analyzer,
                // anyway, it will fail shortly during parsing
                bis.reset();
                return new KvnLexicalAnalyzer(new DataSource(name, () -> bis));
            }
            read += n;
        }

        // attempt to recognize an XML prolog, taking care of Byte Order Mark and encoding
        // we use the tables from section F of Extensible Markup Language (XML) 1.0 (Fifth Edition)
        // W3C Recommendation 26 November 2008 (https://www.w3.org/TR/2008/REC-xml-20081126/#sec-guessing),
        // ignoring the unusual octet orders 2143 and 3412
        if (checkSequence(first, UTF_8)     || checkSequence(first, UTF_8_BOM)     ||
            checkSequence(first, UTF_16_LE) || checkSequence(first, UTF_16_LE_BOM) ||
            checkSequence(first, UTF_16_BE) || checkSequence(first, UTF_16_BE_BOM) ||
            checkSequence(first, UCS_4_LE)  || checkSequence(first, UCS_4_LE_BOM)  ||
            checkSequence(first, UCS_4_BE)  || checkSequence(first, UCS_4_BE_BOM)) {
            // we recognized the "<?xml" sequence at start of an XML file
            bis.reset();
            return new XmlLexicalAnalyzer(new DataSource(name, () -> bis));
        } else {
            // it was not XML, the only other option is KVN
            bis.reset();
            return new KvnLexicalAnalyzer(new DataSource(name, () -> bis));
        }

    }

    /** Select a {@link LexicalAnalyzer} based on content.
     * @param name message name
     * @param reader character stream with message content
     * @return lexical analyzer suited for the data source format
     * @throws IOException if first bytes of source cannot be read
     */
    private static LexicalAnalyzer select(final String name, final Reader reader) throws IOException {

        if (reader == null) {
            throw new OrekitException(OrekitMessages.UNABLE_TO_FIND_FILE, name);
        }
        final BufferedReader br = new BufferedReader(reader, BUFFER);

        // read the first characters
        final int size = CHARS_BOM.length();
        br.mark(size);
        final char[] first = new char[size];
        int read = 0;
        while (read < first.length) {
            final int n = br.read(first, read, size - read);
            if (n < 0) {
                // the file is too short for a proper CCSDS message,
                // we return arbitrarily a KVN lexical analyzer,
                // anyway, it will fail shortly during parsing
                br.reset();
                return new KvnLexicalAnalyzer(new DataSource(name, () -> br));
            }
            read += n;
        }
        final String firstString = new String(first);

        // attempt to recognize an XML prolog
        if (firstString.startsWith(CHARS) || CHARS_BOM.equals(firstString)) {
            // we recognized the "<?xml" sequence at start of an XML file
            br.reset();
            return new XmlLexicalAnalyzer(new DataSource(name, () -> br));
        } else {
            // it was not XML, the only other option is KVN
            br.reset();
            return new KvnLexicalAnalyzer(new DataSource(name, () -> br));
        }

    }

    /** Check if first bytes match reference sequence.
     * @param first first bytes read
     * @param reference reference sequence
     * @return true if first bytes match reference sequence
     */
    private static boolean checkSequence(final byte[] first, final byte[] reference) {
        for (int i = 0; i < reference.length; ++i) {
            if (first[i] != reference[i]) {
                return false;
            }
        }
        return true;
    }

}