Lexer.java

/* Copyright 2002-2024 CS GROUP
 * Licensed to CS GROUP (CS) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * CS licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.orekit.utils.units;

import org.hipparchus.fraction.Fraction;
import org.orekit.errors.OrekitException;
import org.orekit.errors.OrekitMessages;

/** Lexer for units.
 * <p>
 * The lexer walks through a unit specification and splits it into tokens:
 * identifiers, power, multiplication and division operators, parentheses,
 * square root sign, fractions and signed integers (written either with
 * regular characters or with unicode superscripts). The last returned token
 * can be {@link #pushBack() pushed back} so it is returned again by the
 * following call to {@link #next()}.
 * </p>
 * @author Luc Maisonobe
 * @since 11.0
 */
class Lexer {

    /** Unit specification to tokenize. */
    private final CharSequence unitSpecification;

    /** End index. */
    private final int end;

    /** Start index for next token. */
    private int start;

    /** Next to last token emitted. */
    private Token nextToLast;

    /** Last token emitted. */
    private Token last;

    /** Upcoming token (which was pushed back). */
    private Token upcoming;

    /** Build a lexer for a unit specification.
     * @param unitSpecification unit specification to tokenize
     */
    Lexer(final CharSequence unitSpecification) {
        this.unitSpecification = unitSpecification;
        this.end               = unitSpecification.length();
        this.start             = 0;
        this.nextToLast        = null;
        this.last              = null;
        this.upcoming          = null;
    }

    /** Get the complete unit specification.
     * @return complete unit specification
     */
    public String getUnitSpecification() {
        return unitSpecification.toString();
    }

    /** Push back last returned token.
     * <p>
     * This can be called only once
     * </p>
     */
    public void pushBack() {
        // the pushed back token will be served again by next(),
        // and the token history is rewound by one step
        upcoming = last;
        last     = nextToLast;
    }

    /** Get next token.
     * <p>
     * An {@link OrekitException} is thrown if the characters at the current
     * position cannot be recognized as a token, or if an integer is too large
     * to be held in an {@code int}.
     * </p>
     * @return next token, or null if there are no more tokens
     */
    public Token next() {

        if (upcoming != null) {
            // serve the token that was pushed back
            nextToLast = last;
            last       = upcoming;
            upcoming   = null;
            return last;
        }

        // skip whitespace
        while (start < end && Character.isWhitespace(unitSpecification.charAt(start))) {
            ++start;
        }

        if (start >= end) {
            // no more characters to analyze
            nextToLast = last;
            last       = null;
            return null;
        }

        // look for prefixed units
        int current = start;
        while (current < end && isIdentifierCharacter(unitSpecification.charAt(current))) {
            ++current;
        }
        if (current > start) {
            return emit(current, TokenType.IDENTIFIER, 0, 1);
        }

        final char first = unitSpecification.charAt(start);

        // look for power
        if (first == '*' && start + 1 < end && unitSpecification.charAt(start + 1) == '*') {
            // power indicator as **
            return emit(start + 2, TokenType.POWER, 0, 1);
        } else if (first == '^') {
            // power indicator as ^
            return emit(start + 1, TokenType.POWER, 0, 1);
        } else if (convertSuperscript(start) != ' ' &&
                   last != null &&
                   last.getType() != TokenType.POWER) {
            // virtual power indicator as we switch to superscript characters:
            // the token is empty (no character is consumed), the superscript
            // itself will be analyzed as an integer by the following call
            return emit(start, TokenType.POWER, 0, 1);
        }

        // look for one character tokens
        final TokenType operator = singleCharacterType(first);
        if (operator != null) {
            return emit(start + 1, operator, 0, 1);
        }

        // look for special case "0.5" (used by CCSDS for square roots)
        if (matchesAtStart("0.5")) {
            // ½ written as decimal number
            return emit(start + 3, TokenType.FRACTION, 1, 2);
        }

        // look for special case "1.5" (used by CCSDS for power 3/2)
        if (matchesAtStart("1.5")) {
            // 3/2 written as decimal number
            return emit(start + 3, TokenType.FRACTION, 3, 2);
        }

        // look for unicode fractions
        final Token fraction = unicodeFraction(first);
        if (fraction != null) {
            return fraction;
        }

        // it must be an integer, either as regular character or as superscript
        return signedInteger();

    }

    /** Generate an exception.
     * @return generated exception
     */
    public OrekitException generateException() {
        return new OrekitException(OrekitMessages.UNKNOWN_UNIT, unitSpecification);
    }

    /** Emit one token.
     * <p>
     * The token text is the part of the specification between the current
     * start index and {@code after}; the start index is then moved to
     * {@code after} and the token history is updated.
     * </p>
     * @param after index after token
     * @param type token type
     * @param numerator value of the token numerator
     * @param denominator value of the token denominator
     * @return new token
     */
    private Token emit(final int after, final TokenType type, final int numerator, final int denominator) {
        final CharSequence subString = unitSpecification.subSequence(start, after);
        start      = after;
        nextToLast = last;
        // a fraction is attached to the token only when the denominator is not 1
        last       = new Token(subString, type, numerator,
                               denominator == 1 ? null : new Fraction(numerator, denominator));
        return last;
    }

    /** Check if a character can be part of an identifier.
     * @param c character to check
     * @return true if character is a lower case or upper case letter,
     * or one of the symbols accepted in identifiers
     */
    private static boolean isIdentifierCharacter(final char c) {
        if (Character.isLowerCase(c) || Character.isUpperCase(c)) {
            return true;
        }
        switch (c) {
            case '°' :
            case '◦' :
            case '′' :
            case '\'' :
            case '″' :
            case '"' :
            case '%' :
            case '#' :
                return true;
            default :
                return false;
        }
    }

    /** Get the type of a one character token.
     * @param c character to check
     * @return type of the one character token, or null if the character
     * is not a one character token
     */
    private static TokenType singleCharacterType(final char c) {
        switch (c) {
            case '*' :
            case '×' :
            case '.' :
            case '·' :
                return TokenType.MULTIPLICATION;
            case '/' :
            case '⁄' :
                return TokenType.DIVISION;
            case '(' :
                return TokenType.OPEN;
            case ')' :
                return TokenType.CLOSE;
            case '√' :
                return TokenType.SQUARE_ROOT;
            default :
                return null;
        }
    }

    /** Check if the specification contains a literal at current start index.
     * @param literal expected characters
     * @return true if all characters of the literal are found, starting at current start index
     */
    private boolean matchesAtStart(final String literal) {
        if (start + literal.length() > end) {
            // not enough remaining characters
            return false;
        }
        for (int i = 0; i < literal.length(); ++i) {
            if (unitSpecification.charAt(start + i) != literal.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /** Emit a fraction token if a character is a unicode fraction.
     * @param c character at current start index
     * @return emitted fraction token, or null (and nothing is emitted)
     * if the character is not a unicode fraction
     */
    private Token unicodeFraction(final char c) {
        switch (c) {
            case '¼' :
                return emit(start + 1, TokenType.FRACTION, 1, 4);
            case '½' :
                return emit(start + 1, TokenType.FRACTION, 1, 2);
            case '¾' :
                return emit(start + 1, TokenType.FRACTION, 3, 4);
            case '⅐' :
                return emit(start + 1, TokenType.FRACTION, 1, 7);
            case '⅑' :
                return emit(start + 1, TokenType.FRACTION, 1, 9);
            case '⅒' :
                return emit(start + 1, TokenType.FRACTION, 1, 10);
            case '⅓' :
                return emit(start + 1, TokenType.FRACTION, 1, 3);
            case '⅔' :
                return emit(start + 1, TokenType.FRACTION, 2, 3);
            case '⅕' :
                return emit(start + 1, TokenType.FRACTION, 1, 5);
            case '⅖' :
                return emit(start + 1, TokenType.FRACTION, 2, 5);
            case '⅗' :
                return emit(start + 1, TokenType.FRACTION, 3, 5);
            case '⅘' :
                return emit(start + 1, TokenType.FRACTION, 4, 5);
            case '⅙' :
                return emit(start + 1, TokenType.FRACTION, 1, 6);
            case '⅚' :
                return emit(start + 1, TokenType.FRACTION, 5, 6);
            case '⅛' :
                return emit(start + 1, TokenType.FRACTION, 1, 8);
            case '⅜' :
                return emit(start + 1, TokenType.FRACTION, 3, 8);
            case '⅝' :
                return emit(start + 1, TokenType.FRACTION, 5, 8);
            case '⅞' :
                return emit(start + 1, TokenType.FRACTION, 7, 8);
            default :
                return null;
        }
    }

    /** Emit an integer token, with optional sign.
     * <p>
     * The integer is read either with regular characters or with superscript
     * characters, depending on the character at current start index. An
     * {@link OrekitException} is thrown if there are no digits or if the
     * value does not fit in an {@code int}.
     * </p>
     * @return emitted integer token
     */
    private Token signedInteger() {

        // the first character selects how all the characters of the number are read
        final Converter converter = (convertSuperscript(start) == ' ') ?
                                    this::noConvert :
                                    this::convertSuperscript;

        // manage sign, taking care of counting characters properly
        final int sign;
        final int numberStart;
        final char leading = converter.convert(start);
        if (leading == '+') {
            sign        = +1;
            numberStart = start + 1;
        } else if (leading == '-') {
            sign        = -1;
            numberStart = start + 1;
        } else {
            sign        = 1;
            numberStart = start;
        }

        int current = numberStart;
        int value   = 0;
        while (current < end) {
            final int c = converter.convert(current);
            if (c < '0' || c > '9') {
                break;
            }
            final int digit = c - '0';
            if (value > (Integer.MAX_VALUE - digit) / 10) {
                // adding this digit would silently wrap around the int range
                throw generateException();
            }
            value = value * 10 + digit;
            ++current;
        }

        if (current > numberStart) {
            // there were some digits
            return emit(current, TokenType.INTEGER, sign * value, 1);
        }

        throw generateException();

    }

    /** Convert a superscript character to regular digit or sign character.
     * @param index character index
     * @return regular digit or sign character, or ' ' if character is not a superscript
     */
    private char convertSuperscript(final int index) {
        // we can't do fancy stuff with code points
        // superscripts for 1, 2 and 3 are not in the same range as others
        switch (unitSpecification.charAt(index)) {
            case '⁰' :
                return '0';
            case '¹' :
                return '1';
            case '²' :
                return '2';
            case '³' :
                return '3';
            case '⁴' :
                return '4';
            case '⁵' :
                return '5';
            case '⁶' :
                return '6';
            case '⁷' :
                return '7';
            case '⁸' :
                return '8';
            case '⁹' :
                return '9';
            case '⁺' :
                return '+';
            case '⁻' :
                return '-';
            default :
                return ' ';
        }

    }

    /** No-op converter.
     * @param index character index
     * @return character at index
     */
    private char noConvert(final int index) {
        return unitSpecification.charAt(index);
    }

    /** Character converter. */
    @FunctionalInterface
    private interface Converter {
        /** Convert a character.
         * @param index character index
         * @return converted character
         */
        char convert(int index);
    }

}