Source Code of org.openjena.atlas.json.io.parser.TokenizerJSON

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.openjena.atlas.json.io.parser;


import static org.openjena.atlas.lib.Chars.* ;


import java.io.IOException ;
import java.util.NoSuchElementException ;


import org.openjena.atlas.io.IO ;
import org.openjena.atlas.io.PeekReader ;
import org.openjena.atlas.json.JsonParseException ;
import org.openjena.riot.tokens.Token ;
import org.openjena.riot.tokens.TokenType ;
import org.openjena.riot.tokens.Tokenizer ;






/** Tokenizer for all sorts of things JSON-ish */


public class TokenizerJSON implements Tokenizer
{
    // TODO Various allow/deny options
    
    public static final int CTRL_CHAR = CH_STAR ;
    
    private Token token = null ; 
    private final StringBuilder sb = new StringBuilder() ;
    private final PeekReader reader ;
    private boolean finished = false ;
    
    public TokenizerJSON(PeekReader reader)
    {
        this.reader = reader ;
    }
    
    @Override
    public final boolean hasNext()
    {
        if ( finished )
            return false ;
        if ( token != null )
            return true ;
        skip() ;
        if (reader.eof())
            return false ;
        token = parseToken() ;
        return token != null ;
    }
    
    @Override
    public final boolean eof()
    {
        return hasNext() ;
    }


    /** Move to next token */
    @Override
    public final Token next()
    {
        if ( ! hasNext() )
            throw new NoSuchElementException() ;
        Token t = token ;
        token = null ;
        return t ;
    }


    
    @Override
    public final Token peek()
    {
        if ( ! hasNext() ) return null ;
        return token ; 
    }
    
    @Override
    public void remove()
    { throw new UnsupportedOperationException() ; }


    // ---- Machinary
    
    // ""-string, ''-string, *X, 
    // various single characters . , : ; 
    // (), [], {}, <>
    // Numbers (integer, decimal, double)
    // Keys (restricted strings, used as keys in maps)
    //  ALPHA (ALPHA,NUMERIC,_,...)
    
    private Token parseToken()
        {
            token = new Token(getLine(), getColumn()) ;
            
            int ch = reader.peekChar() ;
    
            // ---- String
            // Support both "" and '' strings (only "" is legal JSON)
            if ( ch == CH_QUOTE1 || ch == CH_QUOTE2 )
            {
                reader.readChar() ;
                int ch2 = reader.peekChar() ;
                if (ch2 == ch )
                {
                    // Maybe """-strings/'''-strings
                    reader.readChar() ; // Read potential second quote.
                    int ch3 = reader.peekChar() ;
                    if ( ch3 == ch )
                    {
                        // """-strings/'''-strings
                        reader.readChar() ;
                        token.setImage(readLong(ch, false)) ;
                        TokenType tt = (ch == CH_QUOTE1) ? TokenType.LONG_STRING1 : TokenType.LONG_STRING2 ;
                        token.setType(tt) ;
                        return token ;
                    }
                    // Two quotes then a non-quote.
                    // Must be '' or ""
    
                    // No need to pushback characters as we know the lexical form is the empty string.
                    //if ( ch2 != EOF ) reader.pushbackChar(ch2) ;
                    //if ( ch1 != EOF ) reader.pushbackChar(ch1) ;    // Must be '' or ""
                    token.setImage("") ;
                }
                else
                    // Single quote character.
                    token.setImage(allBetween(ch, ch, true, false)) ;
                // Single quoted string.
                token.setType( (ch == CH_QUOTE1) ? TokenType.STRING1 : TokenType.STRING2 ) ;
                return token ;
            }
    
            // Control (not JSON)
            if ( ch == CTRL_CHAR )
            {
                reader.readChar() ;
                token.setType(TokenType.CNTRL) ;
                ch = reader.readChar() ;
                if ( ch == EOF )
                    exception("EOF found after "+CTRL_CHAR) ;
                token.cntrlCode = (char)ch ;
                return token ;
            }
    
            switch(ch)
            { 
                // DOT can't start a decimal in JSON.  Check for digit.
                case CH_DOT:    
    //                reader.readChar() ;
    //                ch = reader.peekChar() ;
    //                if ( range(ch, '0', '9') )
    //                {
    //                    // Not a DOT after all.
    //                    reader.pushbackChar(CH_DOT) ;
    //                    // Drop through to number code.
    //                    break ;
    //                }
                    token.setType(TokenType.DOT) ;
                    return token ;
                    
                
                case CH_SEMICOLON:  reader.readChar() ; token.setType(TokenType.SEMICOLON) ; return token ;
                case CH_COMMA:      reader.readChar() ; token.setType(TokenType.COMMA) ;     return token ;
                case CH_LBRACE:     reader.readChar() ; token.setType(TokenType.LBRACE) ;    return token ;
                case CH_RBRACE:     reader.readChar() ; token.setType(TokenType.RBRACE) ;    return token ;
                case CH_LPAREN:     reader.readChar() ; token.setType(TokenType.LPAREN) ;    return token ;
                case CH_RPAREN:     reader.readChar() ; token.setType(TokenType.RPAREN) ;    return token ;
                case CH_LBRACKET:   reader.readChar() ; token.setType(TokenType.LBRACKET) ;  return token ;
                case CH_RBRACKET:   reader.readChar() ; token.setType(TokenType.RBRACKET) ;  return token ;
    
                // Some interesting characters
                case CH_COLON:      reader.readChar() ; token.setType(TokenType.COLON) ; return token ;
    //            case CH_UNDERSCORE: reader.readChar() ; token.setType(TokenType.UNDERSCORE) ; return token ;
                case CH_LT:         reader.readChar() ; token.setType(TokenType.LT) ; return token ;
                case CH_GT:         reader.readChar() ; token.setType(TokenType.GT) ; return token ;
                // GE, LE
            }
            
            if ( ch == CH_PLUS || ch == CH_MINUS || range(ch, '0', '9'))
            {
                readNumber() ;
                return token ;
            }
    
            // Plain words and prefixes.
            //   Can't start with a number due to numeric test above.
            //   Can start with a '_' (no blank node test above)
    
            readKeyWord(token) ;
            return token ;
        }


    private void skip()
    {
        int ch = EOF ;
        for ( ;; )
        {
            if ( reader.eof() )
                return ;
    
            ch = reader.peekChar() ;
            if ( ch == CH_HASH )
            {
                reader.readChar() ;
                // Comment.  Skip to NL
                for ( ;; )
                {
                    ch = reader.peekChar() ;
                    if ( ch == EOF || isNewlineChar(ch) )
                        break ;
                    reader.readChar() ;
                }
            }
            
            // Including excess newline chars from comment.
            if ( ! isWhitespace(ch) )
                break ;
            reader.readChar() ;
        }
    }


    private void readKeyWord(Token token2)
    {
        long posn = reader.getPosition() ;
        token2.setImage(readWord(false)) ;
        token2.setType(TokenType.KEYWORD) ;
        int ch = reader.peekChar() ;


        // If we made no progress, nothing found, not even a keyword -- it's an error.
        if ( posn == reader.getPosition() )  
            exception(String.format("Unknown char: %c(%d)",ch,ch)) ;
    }
    
    private String readLong(int quoteChar, boolean endNL)
    {
        sb.setLength(0) ;
        for ( ;; )
        {
            int ch = reader.readChar() ;
            if ( ch == EOF )
            {
                if ( endNL ) return sb.toString() ; 
                exception("Broken long string") ;
            }
            
            if ( ch == quoteChar )
            {
                if ( threeQuotes(quoteChar) )
                    return sb.toString() ;
            }
            
            if ( ch == '\\' )
                ch = readLiteralEscape() ;
            insertLiteralChar(sb, ch) ;
        }
    }


    // Need "readCharOrEscape"
    
    // Assume have read the first quote char.
    // On return:
    //   If false, have moved over no more characters (due to pushbacks) 
    //   If true, at end of 3 quotes
    private boolean threeQuotes(int ch)
    {
        //reader.readChar() ;         // Read first quote.
        int ch2 = reader.peekChar() ;
        if (ch2 != ch )
        {
            //reader.pushbackChar(ch2) ;
            return false ;
        }
        
        reader.readChar() ;         // Read second quote.
        int ch3 = reader.peekChar() ;
        if ( ch3 != ch )
        {
            //reader.pushbackChar(ch3) ;
            reader.pushbackChar(ch2) ;
            return false ;
        }
            
        // Three quotes.
        reader.readChar() ;         // Read third quote.
        return true ;
    }
    
    // Read a "word": alphanumerics, "_", ".", "-"
    private String readWord(boolean leadingDigitAllowed)
    {
        sb.setLength(0) ;
        int idx = 0 ;
        if ( ! leadingDigitAllowed )
        {
            int ch = reader.peekChar() ;
            if ( Character.isDigit(ch) )
                return "" ;
        }
        
        for ( ;; idx++ )
        {
            int ch = reader.peekChar() ;
            
            if ( Character.isLetterOrDigit(ch) || ch == '_' || ch == '.' || ch == '-' )
            {
                reader.readChar() ;
                sb.append((char)ch) ;
                continue ;
            }
            else
                break ;
            
        }
        
//        // Trailing DOT?
//        // BAD : assumes pushbackChar is infinite.
//        // Check is ends in "."
//        while ( idx > 0 && sb.charAt(idx-1) == CH_DOT )
//        {
//            // Push back the dot.
//            reader.pushbackChar(CH_DOT) ;
//            sb.setLength(idx-1) ;
//            idx -- ;
//        }
        return sb.toString() ;
    }


    // Make better!
    /*
    [16]    integer         ::=     ('-' | '+') ? [0-9]+
    [17]    double          ::=     ('-' | '+') ? ( [0-9]+ '.' [0-9]* exponent | '.' ([0-9])+ exponent | ([0-9])+ exponent )
                                    0.e0, .0e0, 0e0
    [18]    decimal         ::=     ('-' | '+')? ( [0-9]+ '.' [0-9]* | '.' ([0-9])+ | ([0-9])+ )
                                    0.0 .0
    [19]    exponent        ::=     [eE] ('-' | '+')? [0-9]+
    []      hex             ::=     0x0123456789ABCDEFG
    
    */
    private void readNumber()
    {
        // One entry, definitely a number.
        // Beware of '.' as a (non) decimal.
        /*
        maybeSign()
        digits()
        if dot ==> decimal, digits
        if e   ==> double, maybeSign, digits
        else
            check not "." for decimal.
        */
        boolean isDouble = false ;
        boolean isDecimal = false ;
        sb.setLength(0) ;
        
        int x = 0 ; // Digits before a dot.
        int ch = reader.peekChar() ;
        if ( ch == '0' )
        {
            x++ ;
            reader.readChar() ;
            sb.append((char)ch) ;
            ch = reader.peekChar() ;
            if ( ch == 'x' || ch == 'X' )
            {
                reader.readChar() ;
                sb.append((char)ch) ;
                readHex(reader, sb) ;
                token.setImage(sb.toString()) ;
                token.setType(TokenType.HEX) ;
                return ;
            }
        }
        else if ( ch == '-' || ch == '+' )
        {
            readPossibleSign(sb) ;
        }
        
        
        x += readDigits(sb) ;
//        if ( x == 0 )
//        {
//            
//        }
        ch = reader.peekChar() ;
        if ( ch == CH_DOT )
        {
            reader.readChar() ;
            sb.append(CH_DOT) ;
            isDecimal = true ;  // Includes things that will be doubles.
            readDigits(sb) ;
        }
        
        if ( x == 0 && ! isDecimal )
            // Possible a tokenizer error - should not have entered readNumber in the first place.
            exception("Unrecognized as number") ;
        
        if ( exponent(sb) )
        {
            isDouble = true ;
            isDecimal = false ;
            
        }
        
        token.setImage(sb.toString()) ;
        if ( isDouble )
            token.setType(TokenType.DOUBLE) ;
        else if ( isDecimal )
            token.setType(TokenType.DECIMAL) ;
        else
            token.setType(TokenType.INTEGER) ;
    }


    
    private static void readHex(PeekReader reader, StringBuilder sb)
    {
        // Just after the 0x, which are in sb
        int x = 0 ;
        for(;;)
        {
            int ch = reader.peekChar() ;


            if ( ! range(ch, '0', '9') && ! range(ch, 'a', 'f') && ! range(ch, 'A', 'F') )
                break ;
            reader.readChar() ;
            sb.append((char)ch) ;
            x++ ;
        }
        if ( x == 0 )
            exception(reader, "No hex characters after "+sb.toString()) ;
    }


    private boolean exponent(StringBuilder sb)
    {
        int ch = reader.peekChar() ;
        if ( ch != 'e' && ch != 'E' )
            return false ;
        reader.readChar() ;
        sb.append((char)ch) ;
        readPossibleSign(sb) ;
        int x = readDigits(sb) ;
        if ( x == 0 )
            exception("Malformed double: "+sb) ;
        return true ;
    }


    private void readPossibleSign(StringBuilder sb)
    {
        int ch = reader.peekChar() ;
        if ( ch == '-' || ch == '+' )
        {
            reader.readChar() ;
            sb.append((char)ch) ;
        }
    }


    private int readDigits(StringBuilder buffer)
    {
        int count = 0 ;
        for(;;)
        {
            int ch = reader.peekChar() ;
            if ( ! range(ch, '0', '9' ) )
                break ;
            reader.readChar() ;
            buffer.append((char)ch) ;
            count ++ ;
        }
        return count ;
    }
    
    private String langTag()
    {
        sb.setLength(0) ;
        a2z(sb) ;
        if ( sb.length() == 0 )
            exception("Bad language tag") ;
        for ( ;; )
        {
            int ch = reader.peekChar() ;
            if ( ch == '-' )
            {
                reader.readChar() ;
                sb.append('-') ;
                int x = sb.length();
                a2zN(sb) ;
                if ( sb.length() == x )
                    exception("Bad language tag") ;
            }
            else
                break ;
        }
        return sb.toString();
    }
    
    private void a2z(StringBuilder sb2)
    {
        for ( ;; )
        {
            int ch = reader.peekChar() ;
            if ( isA2Z(ch) )
            {
                reader.readChar() ;
                sb.append((char)ch) ;
            }
            else
                return ;
        }
    }
    
    private void a2zN(StringBuilder sb2)
    {
        for ( ;; )
        {
            int ch = reader.peekChar() ;
            if ( isA2ZN(ch) )
            {
                reader.readChar() ;
                sb.append((char)ch) ;
            }
            else
                return ;
        }
    }


    // Blank node label: A-Z,a-z0-9 and '-'
    private String blankNodeLabel()
    {
        sb.setLength(0) ;
        boolean seen = false ;
        for(;;)
        {
            int ch = reader.readChar() ;
            if ( ch == EOF )
                break ;
            if ( ! isA2ZN(ch) && ch != '-' )
                break ;
            sb.append((char)ch) ;
            seen = true ;
        }
        if ( ! seen )
            exception("Blank node label missing") ;
        return sb.toString() ; 
    }


    
    // Get characters between two markers.
    // strEscapes may be processed
    // endNL end of line as an ending is OK
    private String allBetween(int startCh, int endCh,
                              boolean strEscapes, boolean endNL)
    {
        long y = getLine() ;
        long x = getColumn() ;
        sb.setLength(0) ;


        // Assumes first char read already.
//        int ch0 = reader.readChar() ;
//        if ( ch0 != startCh )
//            exception("Broken parser", y, x) ;


        
        for(;;)
        {
            int ch = reader.readChar() ;
            if ( ch == EOF )
            {
                if ( endNL ) return sb.toString() ; 
                exception("Broken token: "+sb.toString(), y, x) ;
            }


            if ( ch == '\n' )
                exception("Broken token (newline): "+sb.toString(), y, x) ;
            
            if ( ch == endCh )
            {
                //sb.append(((char)ch)) ;
                return sb.toString() ;
            }
            
            if ( ch == '\\' )
            {
                if ( strEscapes )
                    ch = readLiteralEscape() ;
                else
                {
                    ch = reader.readChar() ;
                    if ( ch == EOF )
                    {
                        if ( endNL ) return sb.toString() ; 
                        exception("Broken token: "+sb.toString(), y, x) ;
                    }
    
                    switch (ch)
                    {
                        case 'u': ch = readUnicode4Escape(); break ;
                        case 'U': ch = readUnicode4Escape(); break ;
                        default:
                            exception(String.format("illegal escape sequence value: %c (0x%02X)", ch, ch));
                            break ;
                    }
                }
            }
            insertLiteralChar(sb, ch) ;
        }
    }
    
    private void insertLiteralChar(StringBuilder buffer, int ch)
    {
        if ( Character.charCount(ch) == 1 )
            buffer.append((char)ch) ;
        else
        {
            // Convert to UTF-16.  Note that the rest of any systemn this is used
            // in must also respect codepoints and surrogate pairs. 
            if ( ! Character.isDefined(ch) && ! Character.isSupplementaryCodePoint(ch) )
                exception(String.format("Illegal codepoint: 0x%04X", ch)) ;
            char[] chars = Character.toChars(ch) ;
            buffer.append(chars) ;
        }
    }


    @Override
    public long getColumn()
    {
        return reader.getColNum() ;
    }


    @Override
    public long getLine()
    {
        return reader.getLineNum() ;
    }


    // ---- Character classes 
    
    @Override
    public void close()
    {
        try { reader.close() ; }
        catch (IOException ex) { IO.exception(ex) ; }
    }


    private boolean isA2Z(int ch)
    {
        return range(ch, 'a', 'z') || range(ch, 'A', 'Z') ;
    }


    private boolean isA2ZN(int ch)
    {
        return range(ch, 'a', 'z') || range(ch, 'A', 'Z') || range(ch, '0', '9') ;
    }


    private boolean isNumeric(int ch)
    {
        return range(ch, '0', '9') ;
    }
    
    private static boolean isWhitespace(int ch)
    {
        return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == '\f' ;    
    }
    
    private static boolean isNewlineChar(int ch)
    {
        return ch == '\r' || ch == '\n' ;
    }


    // ---- Escape sequences
    
    private final
    int readLiteralEscape()
    {
        int c = reader.readChar();
        if ( c==EOF )
            exception("Escape sequence not completed") ;


        switch (c)
        {
            case 'n':   return NL ; 
            case 'r':   return CR ;
            case 't':   return '\t' ;
            case '"':   return '"' ;
            case '/':   return '/' ;    // JSON requires / escapes.
            case '\'':  return '\'' ;
            case '\\':  return '\\' ;
            case 'u':   return readUnicode4Escape();
            case 'U':   return readUnicode8Escape();
            default:
                exception(String.format("illegal escape sequence value: %c (0x%02X)", c, c));
                return 0 ;
        }
    }
    
    
    private final
    int readUnicodeEscape()
    {
        int ch = reader.readChar() ;
        if ( ch == EOF )
            exception("Broken escape sequence") ;


        switch (ch)
        {
            case 'u': return readUnicode4Escape(); 
            case 'U': return readUnicode8Escape(); 
            default:
                exception(String.format("illegal escape sequence value: %c (0x%02X)", ch, ch));
        }
        return 0 ;
    }
    
    private final
    int readUnicode4Escape() { return readUnicodeEscape(4) ; }
    
    private final
    int readUnicode8Escape()
    {
        int ch8 = readUnicodeEscape(8) ;
        if ( ch8 > Character.MAX_CODE_POINT )
            exception(String.format("illegal code point in \\U sequence value: 0x%08X", ch8));
        return ch8 ;
    }
    
    private final
    int readUnicodeEscape(int N)
    {
        int x = 0 ;
        for ( int i = 0 ; i < N ; i++ )
        {
            int d = readHexChar() ;
            if ( d < 0 )
                return -1 ;
            x = (x<<4)+d ;
        }
        return x ; 
    }
    
    private final
    int readHexChar()
    {
        int ch = reader.readChar() ;
        if ( ch == EOF )
            exception("Not a hexadecimal character (end of file)") ;


        if ( range(ch, '0', '9') )
            return ch-'0' ;
        if ( range(ch, 'a', 'f') )
            return ch-'a'+10 ;
        if ( range(ch, 'A', 'F') )
            return ch-'A'+10 ;
        
        exception("Not a hexadecimal character: "+(char)ch) ;
        return -1 ; 
    }
    
    private static boolean range(int ch, char a, char b)
    {
        return ( ch >= a && ch <= b ) ;
    }


    private boolean expect(String str) {
        for (int i = 0; i < str.length(); i++) {
            char want = str.charAt(i);
            if (reader.eof())
            {
                exception("End of input during expected string: "+str) ;
                return false ;
            }
            int inChar = reader.readChar();
            if (inChar != want) {
                //System.err.println("N-triple reader error");
                exception("expected \"" + str + "\"");
                return false;
            }
        }
        return true;
    }


    private void exception(String message)
    {
        exception(message, reader.getLineNum(), reader.getColNum()) ;
    }
    
    private static void exception(PeekReader reader, String message)
    {
        exception(message, reader.getLineNum(), reader.getColNum()) ;
    }


    private static void exception(String message, long line, long col)
    {
        throw new JsonParseException(message, (int)line, (int)col) ;
    }
}
Source Code of org.openjena.atlas.json.io.parser.TokenizerJSON

Related Classes of org.openjena.atlas.json.io.parser.TokenizerJSON