Package org.openjena.riot.tokens

Source Code of org.openjena.riot.tokens.TokenizerBytes

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.openjena.riot.tokens;

import static org.openjena.atlas.lib.Chars.* ;
import static org.openjena.riot.system.RiotChars.isA2Z ;
import static org.openjena.riot.system.RiotChars.isA2ZN ;
import static org.openjena.riot.system.RiotChars.isAlphaNumeric ;
import static org.openjena.riot.system.RiotChars.isNewlineChar ;
import static org.openjena.riot.system.RiotChars.isWhitespace ;
import static org.openjena.riot.system.RiotChars.range ;
import static org.openjena.riot.system.RiotChars.valHexChar ;

import java.io.IOException ;
import java.util.NoSuchElementException ;

import org.openjena.atlas.AtlasException ;
import org.openjena.atlas.io.IO ;
import org.openjena.atlas.io.PeekInputStream ;
import org.openjena.atlas.io.InStreamUTF8 ;
import org.openjena.riot.RiotParseException ;


/** Tokenizer for all sorts of things RDF-ish */

public final class TokenizerBytes implements Tokenizer
{
    // UNFINISHED
    // Almost certainly out of date.
   
    // This class works directly on bytes but can't handle some multi-byte
    // cases (e.g. start of a prefixed name).  Better to work in character space
    // (conversion has to be done anyway), even if it's over a very simple
    // UTF-8 decoder or the standard Java one (which is quite
    // efficient if done in large blocks of bytes).
   
    /* Better - TokenizerBase
     * Abstract InputSource 
     *   Iterator
     *     abstract "int nextCharOrByte()" and "char nextChar"
     *     CharConvert(inputstream) // CharConvert(chByte, inputstream)
     *     accumulate(stringBuffer,  
     *     TokenizerBase works in int or char
     *     abstract scanners.
     *  insertLiteralChar
     */
   
    // Byte-based tokenizer.
    // Assumes that any marker chars are bytes (code points less than 128)
   
    // Currently, this code is only ASCII because it does byte->char 1:1
   
    // Space for CURIEs, stricter Turtle QNames, sane Turtle (i.e. leading digits in local part).
    /** Byte that introduces a control token; the byte following it is stored as the token's control code. */
    public static final int CTRL_CHAR = B_STAR ;
    /** When true, each parsed token is also passed to the installed {@link TokenChecker} (if one is set). */
    public static boolean Checking = false ;
    // Token parsed by hasNext() and not yet handed out by next(); null when nothing is pending.
    private Token token = null ;
    // Scratch buffer shared by the read* methods; each of them resets its length before use.
    private final StringBuilder stringBuilder = new StringBuilder(200) ;
    // Source of the bytes being tokenized.
    private final PeekInputStream inputStream ;
   
    // Set by hasNext() once no further token can be produced; hasNext() then always returns false.
    private boolean finished = false ;
    // Optional checker consulted by the check* methods; null means no checking is performed.
    private TokenChecker checker = null ; // new CheckerBase()  ;
   
    /**
     * Create a tokenizer that reads its bytes from the given stream.
     * The stream is stored as-is; nothing is read until a token is requested.
     *
     * @param inputStream the byte source to tokenize
     */
    /*package*/ TokenizerBytes(PeekInputStream inputStream)
    {
        this.inputStream = inputStream ;
    }
   
    // Share with TokenizerText
    @Override
    public final boolean hasNext()
    {
        if ( finished )
            return false ;
        if ( token != null )
            return true ;
       
        try {
            skip() ;
            if (inputStream.eof())
            {
                //close() ;
                finished = true ;
                return false ;
            }
            token = parseToken() ;
            if ( token == null )
            {
                //close() ;
                finished = true ;
                return false ;
            }
            return true ;
        } catch (AtlasException ex)
        {
            if ( ex.getCause().getClass() == java.nio.charset.MalformedInputException.class )
                throw new RiotParseException("Bad character encoding", inputStream.getLineNum(), inputStream.getColNum()) ;
            throw new RiotParseException("Bad input stream", inputStream.getLineNum(), inputStream.getColNum()) ;
        }
    }
   
    @Override
    public final Token next()
    {
        if ( ! hasNext() )
            throw new NoSuchElementException() ;
        Token t = token ;
        token = null ;
        return t ;
    }
   
   
    @Override
    public final Token peek()
    {
        if ( ! hasNext() ) return null ;
        return token ;
    }
   
    @Override
    public final boolean eof()
    {
        return hasNext() ;
    }
   
    @Override
    public void remove()
    { throw new UnsupportedOperationException() ; }

    public TokenChecker getChecker() { return checker ; }
    public void setChecker(TokenChecker checker) { this.checker = checker ; }

    @Override
    public void close()
    {
        try { inputStream.close() ; }
        catch (IOException ex) { IO.exception(ex) ; }
    }

    // ---- Machinery
   
    private void skip()
    {
        int ch = EOF ;
        for ( ;; )
        {
            if ( inputStream.eof() )
                return ;
   
            ch = inputStream.peekByte() ;
            if ( ch == B_HASH )
            {
                inputStream.readByte() ;
                // Comment.  Skip to NL
                for ( ;; )
                {
                    ch = inputStream.peekByte() ;
                    if ( ch == EOF || isNewlineChar(ch) )
                        break ;
                    inputStream.readByte() ;
                }
            }
           
            // Including excess newline chars from comment.
            if ( ! isWhitespace(ch) )
                break ;
            inputStream.readByte() ;
        }
    }

    private Token parseToken()
    {
        token = new Token(getLine(), getColumn()) ;
       
        int chByte = inputStream.peekByte() ;

        // ---- IRI
        if ( chByte == B_LT )
        {
            inputStream.readByte() ;
            token.setImage(allBetween(B_LT, B_GT, false, false)) ;
            token.setType(TokenType.IRI) ;
            if ( Checking ) checkURI(token.getImage()) ;
            return token ;
        }

        // ---- Literal
        if ( chByte == B_QUOTE1 || chByte == B_QUOTE2 )
        {
            inputStream.readByte() ;
            int ch2 = inputStream.peekByte() ;
            if (ch2 == chByte )
            {
                inputStream.readByte() ; // Read second quote.
                int ch3 = inputStream.peekByte() ;
                if ( ch3 == chByte )
                {
                    inputStream.readByte() ;
                    token.setImage(readLongString(chByte, false)) ;
                    TokenType tt = (chByte == B_QUOTE1) ? TokenType.LONG_STRING1 : TokenType.LONG_STRING2 ;
                    token.setType(tt) ;
                }
                else
                {
                    // Two quotes then a non-quote.
                    // Must be '' or ""
                    // No need to pushback characters as we know the lexical form is the empty string.
                    //if ( ch2 != EOF ) inputStream.pushbackChar(ch2) ;
                    //if ( ch1 != EOF ) inputStream.pushbackChar(ch1) ;    // Must be '' or ""
                    token.setImage("") ;
                    token.setType( (chByte == B_QUOTE1) ? TokenType.STRING1 : TokenType.STRING2 ) ;
                }
            }
            else
            {
                // Single quote character.
                token.setImage(allBetween(chByte, chByte, true, false)) ;
                // Single quoted string.
                token.setType( (chByte == B_QUOTE1) ? TokenType.STRING1 : TokenType.STRING2 ) ;
            }
           
            // Literal.  Is it @ or ^^
            if ( inputStream.peekByte() == B_AT )
            {
                inputStream.readByte() ;
                token.setImage2(langTag()) ;
                token.setType(TokenType.LITERAL_LANG) ;
                if ( Checking ) checkLiteralLang(token.getImage(), token.getImage2()) ;
            }
            else if ( inputStream.peekByte() == '^' )
            {
                expect("^^") ;
               
                // Recursive call!
                // Check no whitespace.
                int nextCh = inputStream.peekByte() ;
                if ( isWhitespace(nextCh) )
                    exception("No whitespace after ^^ in literal with datatype") ;
                // Stash current token.
                Token mainToken = token ;
                Token subToken = parseToken() ;
                if ( ! subToken.isIRI() )
                    exception("Datatype URI required after ^^ - URI or prefixed name expected") ;
                token = mainToken ;
                token.setSubToken(subToken) ;
                token.setType(TokenType.LITERAL_DT) ;
                if ( Checking ) checkLiteralDT(token.getImage(), subToken) ;
            }
            else
            {
                // Was a simple string.
                if ( Checking ) checkString(token.getImage()) ;
            }
            return token ;
        }

        if ( chByte == B_UNDERSCORE )        // Blank node :label must be at least one char
        {
            expect("_:") ;
            token.setImage(readBlankNodeLabel()) ;
            token.setType(TokenType.BNODE) ;
            if ( Checking ) checkBlankNode(token.getImage()) ;
            return token ;
        }

        // Control
        if ( chByte == CTRL_CHAR )
        {
            inputStream.readByte() ;
            token.setType(TokenType.CNTRL) ;
            chByte = inputStream.readByte() ;
            if ( chByte == EOF )
                exception("EOF found after "+CTRL_CHAR) ;
            token.cntrlCode = (char)chByte ;
            if ( Checking ) checkControl(token.cntrlCode) ;
            return token ;
        }

        if ( chByte == B_AT )
        {
            inputStream.readByte() ;
            token.setType(TokenType.DIRECTIVE) ;
            token.setImage(readWord(false)) ;
            if ( Checking ) checkDirective(token.cntrlCode) ;
            return token ;
        }
       
        if ( chByte == B_QMARK )
        {
            inputStream.readByte() ;
            token.setType(TokenType.VAR) ;
            // Character set?
            token.setImage(readWord(true)) ;
            if ( Checking ) checkVariable(token.getImage()) ;
            return token ;
        }
       
        // Number?
        switch(chByte)
        {
            // DOT can start a decimal.  Check for digit.
            case B_DOT:
                inputStream.readByte() ;
                chByte = inputStream.peekByte() ;
                if ( range(chByte, '0', '9') )
                {
                    // Not a DOT after all.
                    inputStream.pushbackByte(B_DOT) ;
                    readNumber() ;
                    return token ;
                }
                token.setType(TokenType.DOT) ;
                return token ;
           
            case B_SEMICOLON:  inputStream.readByte() ; token.setType(TokenType.SEMICOLON) ; token.setImage(";") ; return token ;
            case B_COMMA:      inputStream.readByte() ; token.setType(TokenType.COMMA) ;     token.setImage(",") ; return token ;
            case B_LBRACE:     inputStream.readByte() ; token.setType(TokenType.LBRACE) ;    token.setImage("{") ; return token ;
            case B_RBRACE:     inputStream.readByte() ; token.setType(TokenType.RBRACE) ;    token.setImage("}") ; return token ;
            case B_LPAREN:     inputStream.readByte() ; token.setType(TokenType.LPAREN) ;    token.setImage("(") ; return token ;
            case B_RPAREN:     inputStream.readByte() ; token.setType(TokenType.RPAREN) ;    token.setImage(")") ; return token ;
            case B_LBRACKET:   inputStream.readByte() ; token.setType(TokenType.LBRACKET) ;  token.setImage("[") ; return token ;
            case B_RBRACKET:   inputStream.readByte() ; token.setType(TokenType.RBRACKET) ;  token.setImage("]") ; return token ;

            // Specials (if processing off) -- FIX ME
            //case B_COLON:      inputStream.readByte() ; token.setType(TokenType.COLON) ; return token ;
            case B_UNDERSCORE: inputStream.readByte() ; token.setType(TokenType.UNDERSCORE) ; token.setImage("_") ; return token ;
            case B_LT:         inputStream.readByte() ; token.setType(TokenType.LT) ; token.setImage("<") ; return token ;
            case B_GT:         inputStream.readByte() ; token.setType(TokenType.GT) ; token.setImage(">") ; return token ;
            // GE, LE
            // Single character symbols for * / + -

//            case B_PLUS:
//            case B_MINUS:
//            case B_STAR:
//            case B_SLASH:
//            case B_RSLASH:
               
        }
       
       
        if ( chByte == B_PLUS || chByte == B_MINUS || range(chByte, '0', '9'))
        {
            readNumber() ;
            if ( Checking ) checkNumber(token.getImage(), token.getImage2() ) ;
            return token ;
        }

        // Plain words and prefixes.
        //   Can't start with a number due to numeric test above.
        //   Can't start with a '_' due to blank node test above.
        // If we see a :, the first time it means a prefixed name else it's a token break.

        readPrefixedNameOrKeyWord(token) ;
       
        if ( Checking ) checkKeyword(token.getImage()) ;
        return token ;
    }

   
    private void readPrefixedNameOrKeyWord(Token token2)
    {
        long posn = inputStream.getPosition() ;
        token2.setImage(readWord(false)) ;
        token2.setType(TokenType.KEYWORD) ;
        int ch = inputStream.peekByte() ;
        if ( ch == B_COLON )
        {
            inputStream.readByte() ;
            token2.setType(TokenType.PREFIXED_NAME) ;
            String ln = readLocalPart() ;
            token2.setImage2(ln) ;
            if ( Checking ) checkPrefixedName(token2.getImage(), token2.getImage2()) ;
        }

        // If we made no progress, nothing found, not even a keyword -- it's an error.
        if ( posn == inputStream.getPosition() ) 
            exception(String.format("Unknown char: %c(%d)",ch,ch)) ;

        if ( Checking ) checkKeyword(token2.getImage()) ;
       
    }
   
    private String readLongString(int quoteChar, boolean endNL)
    {
        stringBuilder.setLength(0) ;
        for ( ;; )
        {
            int ch = inputStream.readByte() ;
            if ( ch == EOF )
            {
                if ( endNL ) return stringBuilder.toString() ;
                exception("Broken long string") ;
            }
           
            if ( ch == quoteChar )
            {
                if ( threeQuotes(quoteChar) )
                    return stringBuilder.toString() ;
            }
           
            if ( ch == '\\' )
                ch = readLiteralEscape() ;
            insertCodepoint(stringBuilder, ch) ;
        }
    }

    // Need "readByteOrEscape"
   
    private String readLocalPart()
    { return readWordSub(true, true) ; }
   
    private String readWord(boolean leadingDigitAllowed)
    { return readWordSub(leadingDigitAllowed, false) ; }
   
    private String readWordSub(boolean leadingDigitAllowed, boolean leadingSignAllowed)
    {
        stringBuilder.setLength(0) ;
        int idx = 0 ;
        if ( ! leadingDigitAllowed )
        {
            int ch = inputStream.peekByte() ;
            if ( Character.isDigit(ch) )
                return "" ;
        }
        if ( ! leadingSignAllowed )
        {
            int ch = inputStream.peekByte() ;
            if ( ch == '-' || ch == '+' )
                return "" ;
        }
       
        for ( ;; idx++ )
        {
            int ch = inputStream.peekByte() ;
           
            if ( Character.isLetterOrDigit(ch) || ch == '_' || ch == '.' || ch == '-'  ) // ||  ch == '#' || ch == '/' )
            {
                inputStream.readByte() ;
                // UTF-8
                int ch2 = InStreamUTF8.advance(inputStream.getInput()) ;
                stringBuilder.append((char)ch2) ;
                continue ;
            }
            else
                break ;
           
        }
        // BAD : assumes pushbackChar is infinite.
        // Check is ends in "."
        while ( idx > 0 && stringBuilder.charAt(idx-1) == B_DOT )
        {
            // Push back the dot.
            inputStream.pushbackByte(B_DOT) ;
            stringBuilder.setLength(idx-1) ;
            idx -- ;
        }
        return stringBuilder.toString() ;
    }

    // Need "readByteOrEscape"
   
    // Assume have read the first quote char.
    // On return:
    //   If false, have moved over no more characters (due to pushbacks)
    //   If true, at end of 3 quotes
    private boolean threeQuotes(int ch)
    {
        //inputStream.readByte() ;         // Read first quote.
        int ch2 = inputStream.peekByte() ;
        if (ch2 != ch )
        {
            //inputStream.pushbackChar(ch2) ;
            return false ;
        }
       
        inputStream.readByte() ;         // Read second quote.
        int ch3 = inputStream.peekByte() ;
        if ( ch3 != ch )
        {
            inputStream.pushbackByte(ch2) ;
            return false ;
        }
           
        // Three quotes.
        inputStream.readByte() ;         // Read third quote.
        return true ;
    }

    // Make better!
    /*
    [16]    integer         ::=     ('-' | '+') ? [0-9]+
    [17]    double          ::=     ('-' | '+') ? ( [0-9]+ '.' [0-9]* exponent | '.' ([0-9])+ exponent | ([0-9])+ exponent )
                                    0.e0, .0e0, 0e0
    [18]    decimal         ::=     ('-' | '+')? ( [0-9]+ '.' [0-9]* | '.' ([0-9])+ | ([0-9])+ )
                                    0.0 .0
    [19]    exponent        ::=     [eE] ('-' | '+')? [0-9]+
    []      hex             ::=     '0' [xX] [0-9a-fA-F]+
   
    */
    private void readNumber()
    {
        // One entry, definitely a number.
        // Beware of '.' as a (non) decimal.
        /*
        maybeSign()
        digits()
        if dot ==> decimal, digits
        if e   ==> double, maybeSign, digits
        else
            check not "." for decimal.
        */
        boolean isDouble = false ;
        boolean isDecimal = false ;
        stringBuilder.setLength(0) ;
       
        int x = 0 ; // Digits before a dot.
        int ch = inputStream.peekByte() ;
        if ( ch == '0' )
        {
            x++ ;
            inputStream.readByte() ;
            // Digit 0
            stringBuilder.append((char)ch) ;
            ch = inputStream.peekByte() ;
            if ( ch == 'x' || ch == 'X' )
            {
                inputStream.readByte() ;
                stringBuilder.append((char)ch) ;
                readHex(inputStream, stringBuilder) ;
                token.setImage(stringBuilder.toString()) ;
                token.setType(TokenType.HEX) ;
                return ;
            }
        }
        else if ( ch == '-' || ch == '+' )
        {
            readPossibleSign(stringBuilder) ;
        }
       
       
        x += readDigits(stringBuilder) ;
//        if ( x == 0 )
//        {
//           
//        }
        ch = inputStream.peekByte() ;
        if ( ch == B_DOT )
        {
            inputStream.readByte() ;
            stringBuilder.append(B_DOT) ;
            isDecimal = true // Includes things that will be doubles.
            readDigits(stringBuilder) ;
        }
       
        if ( x == 0 && ! isDecimal )
            // Possible a tokenizer error - should not have entered readNumber in the first place.
            exception("Unrecognized as number") ;
       
        if ( exponent(stringBuilder) )
        {
            isDouble = true ;
            isDecimal = false ;
           
        }
       
        token.setImage(stringBuilder.toString()) ;
        if ( isDouble )
            token.setType(TokenType.DOUBLE) ;
        else if ( isDecimal )
            token.setType(TokenType.DECIMAL) ;
        else
            token.setType(TokenType.INTEGER) ;
    }

   
    private static void readHex(PeekInputStream inputStream, StringBuilder sb)
    {
        // Just after the 0x, which are in sb
        int x = 0 ;
        for(;;)
        {
            int ch = inputStream.peekByte() ;

            if ( ! range(ch, '0', '9') && ! range(ch, 'a', 'f') && ! range(ch, 'A', 'F') )
                break ;
            inputStream.readByte() ;
            // Less than codepoint 128
            sb.append((char)ch) ;
            x++ ;
        }
        if ( x == 0 )
            exception(inputStream, "No hex characters after "+sb.toString()) ;
    }

    private boolean exponent(StringBuilder sb)
    {
        int ch = inputStream.peekByte() ;
        if ( ch != 'e' && ch != 'E' )
            return false ;
        inputStream.readByte() ;
        sb.append((char)ch) ;
        readPossibleSign(sb) ;
        int x = readDigits(sb) ;
        if ( x == 0 )
            exception("Malformed double: "+sb) ;
        return true ;
    }

    private void readPossibleSign(StringBuilder sb)
    {
        int ch = inputStream.peekByte() ;
        if ( ch == '-' || ch == '+' )
        {
            inputStream.readByte() ;
            sb.append((char)ch) ;
        }
    }

    private int readDigits(StringBuilder buffer)
    {
        int count = 0 ;
        for(;;)
        {
            int ch = inputStream.peekByte() ;
            if ( ! range(ch, '0', '9' ) )
                break ;
            inputStream.readByte() ;
            // Less than code point 128
            buffer.append((char)ch) ;
            count ++ ;
        }
        return count ;
    }
   
    private String langTag()
    {
        stringBuilder.setLength(0) ;
        a2z(stringBuilder) ;
        if ( stringBuilder.length() == 0 )
            exception("Bad language tag") ;
        for ( ;; )
        {
            int ch = inputStream.peekByte() ;
            if ( ch == '-' )
            {
                inputStream.readByte() ;
                stringBuilder.append('-') ;
                int x = stringBuilder.length();
                a2zN(stringBuilder) ;
                if ( stringBuilder.length() == x )
                    exception("Bad language tag") ;
            }
            else
                break ;
        }
        return stringBuilder.toString();
    }
   
    private void a2z(StringBuilder sb2)
    {
        for ( ;; )
        {
            int ch = inputStream.peekByte() ;
            if ( isA2Z(ch) )
            {
                inputStream.readByte() ;
                // Less than codepoint 128
                stringBuilder.append((char)ch) ;
            }
            else
                return ;
        }
    }
   
    private void a2zN(StringBuilder sb2)
    {
        for ( ;; )
        {
            int ch = inputStream.peekByte() ;
            if ( isA2ZN(ch) )
            {
                inputStream.readByte() ;
                stringBuilder.append((char)ch) ;
            }
            else
                return ;
        }
    }

    // Blank node label: A-Z,a-z0-9 and '-'
    // Also possible: skip to space or EOF
    private String readBlankNodeLabel()
    {
        stringBuilder.setLength(0) ;
        // First character.
        {
            int ch = inputStream.peekByte() ;
            if ( ch == EOF )
                exception("Blank node label missing (EOF found)") ;
            if ( isWhitespace(ch))
                exception("Blank node label missing") ;
            //if ( ! isAlpha(ch) && ch != '_' )
            // Not strict
            if ( ! isAlphaNumeric(ch) && ch != '_' )
                exception("Blank node label does not start with alphabetic or _ :"+(char)ch) ;
            inputStream.readByte() ;
            stringBuilder.append((char)ch) ;
        }
        // Remainder.
        for(;;)
        {
            int ch = inputStream.peekByte() ;
            if ( ch == EOF )
                break ;
            if ( ! isAlphaNumeric(ch) && ch != '-' && ch != '_' )
                break ;
            inputStream.readByte() ;
            stringBuilder.append((char)ch) ;
        }
//        if ( ! seen )
//            exception("Blank node label missing") ;
        return stringBuilder.toString() ;
    }

   
    // Get characters between two markers.
    // strEscapes may be processed
    // endNL end of line as an ending is OK
    private String allBetween(int startCh, int endCh,
                              boolean strEscapes, boolean endNL)
    {
        long y = getLine() ;
        long x = getColumn() ;
        stringBuilder.setLength(0) ;

        // Assumes first char read already.
//        int ch0 = inputStream.readByte() ;
//        if ( ch0 != startCh )
//            exception("Broken parser", y, x) ;

       
        for(;;)
        {
            int ch = inputStream.readByte() ;
            if ( ch == EOF )
            {
                if ( endNL ) return stringBuilder.toString() ;
                exception("Broken token: "+stringBuilder.toString(), y, x) ;
            }

            if ( ch == '\n' )
                exception("Broken token (newline): "+stringBuilder.toString(), y, x) ;
           
            if ( ch == endCh )
            {
                //sb.append(((char)ch)) ;
                return stringBuilder.toString() ;
            }
           
            if ( ch == '\\' )
            {
                if ( strEscapes )
                    ch = readLiteralEscape() ;
                else
                    ch = readUnicodeEscape() ;
                insertCodepoint(stringBuilder, ch) ;
                continue ;
            }
            // Not special.
            insertChar(stringBuilder, ch) ;
        }
    }
   
    // Insert character, knowing the first byte.
    private void insertChar(StringBuilder buffer, int first)
    {
        int ch2 = InStreamUTF8.advance(inputStream.getInput(), first) ;
        insertCodepoint(buffer, ch2) ;
    }

    // ch is already a unicode codepoint
    private void insertCodepoint(StringBuilder buffer, int ch)
    {
        if ( Character.charCount(ch) == 1 )
            buffer.append((char)ch) ;
        else
        {
            // Convert to UTF-16.  Note that the rest of any system this is used
            // in must also respect codepoints and surrogate pairs.
            if ( ! Character.isDefined(ch) && ! Character.isSupplementaryCodePoint(ch) )
                exception(String.format("Illegal codepoint: 0x%04X", ch)) ;
            char[] chars = Character.toChars(ch) ;
            buffer.append(chars) ;
        }
    }

   
   
    @Override
    public long getColumn()
    {
        return inputStream.getColNum() ;
    }

    @Override
    public long getLine()
    {
        return inputStream.getLineNum() ;
    }

    // ---- Routines to check tokens
   
    private void checkBlankNode(String blankNodeLabel)
    {
        if ( checker != null ) checker.checkBlankNode(blankNodeLabel) ;
    }

    private void checkLiteralLang(String lexicalForm, String langTag)
    {
       if ( checker != null ) checker.checkLiteralLang(lexicalForm, langTag) ;
    }

    private void checkLiteralDT(String lexicalForm, Token datatype)
    {
       if ( checker != null ) checker.checkLiteralDT(lexicalForm, datatype) ;
    }

    private void checkString(String string)
    {
       if ( checker != null ) checker.checkString(string) ;
    }

    private void checkURI(String uriStr)
    {
       if ( checker != null ) checker.checkURI(uriStr) ;
    }

    private void checkNumber(String image, String datatype)
    {
       if ( checker != null ) checker.checkNumber(image, datatype) ;
    }

    private void checkVariable(String tokenImage)
    {
       if ( checker != null ) checker.checkVariable(tokenImage) ;
    }

    private void checkDirective(int cntrlCode)
    {
       if ( checker != null ) checker.checkDirective(cntrlCode) ;
    }

    private void checkKeyword(String tokenImage)
    {
       if ( checker != null ) checker.checkKeyword(tokenImage) ;
    }

    private void checkPrefixedName(String tokenImage, String tokenImage2)
    {
       if ( checker != null ) checker.checkPrefixedName(tokenImage, tokenImage2) ;
    }

    private void checkControl(int code)
    {
       if ( checker != null ) checker.checkControl(code) ;
    }

    // ---- Escape sequences
   
    private final
    int readLiteralEscape()
    {
        int c = inputStream.readByte();
        if ( c==EOF )
            exception("Escape sequence not completed") ;

        switch (c)
        {
            case 'n':   return NL ;
            case 'r':   return CR ;
            case 't':   return '\t' ;
            case '"':   return '"' ;
            case '\''return '\'' ;
            case '\\'return '\\' ;
            case 'u':   return readUnicode4Escape();
            case 'U':   return readUnicode8Escape();
            default:
                exception(String.format("illegal escape sequence value: %c (0x%02X)", c, c));
                return 0 ;
        }
    }
   
   
    private final
    int readUnicodeEscape()
    {
        int ch = inputStream.readByte() ;
        if ( ch == EOF )
            exception("Broken escape sequence") ;

        switch (ch)
        {
            case 'u': return readUnicode4Escape();
            case 'U': return readUnicode8Escape();
            default:
                exception(String.format("illegal escape sequence value: %c (0x%02X)", ch, ch));
        }
        return 0 ;
    }
   
    private final
    int readUnicode4Escape() { return readUnicodeEscape(4) ; }
   
    private final
    int readUnicode8Escape()
    {
        int ch8 = readUnicodeEscape(8) ;
        if ( ch8 > Character.MAX_CODE_POINT )
            exception(String.format("illegal code point in \\U sequence value: 0x%08X", ch8));
        return ch8 ;
    }
   
    private final
    int readUnicodeEscape(int N)
    {
        int x = 0 ;
        for ( int i = 0 ; i < N ; i++ )
        {
            int d = readHexChar() ;
            if ( d < 0 )
                return -1 ;
            x = (x<<4)+d ;
        }
        return x ;
    }
   
    private final
    int readHexChar()
    {
        int ch = inputStream.readByte() ;
        if ( ch == EOF )
            exception("Not a hexadecimal character (end of file)") ;

        int x =  valHexChar(ch) ;
        if ( x != -1 )
            return x ;
        exception("Not a hexadecimal character: "+(char)ch) ;
        return -1 ;
    }
   
    private boolean expect(String str) {
        for (int i = 0; i < str.length(); i++) {
            char want = str.charAt(i);
            if (inputStream.eof())
            {
                exception("End of input during expected string: "+str) ;
                return false ;
            }
            int inChar = inputStream.readByte();
            if (inChar != want) {
                //System.err.println("N-triple inputStream error");
                exception("expected \"" + str + "\"");
                return false;
            }
        }
        return true;
    }

    private void exception(String message)
    {
        exception(message, inputStream.getLineNum(), inputStream.getColNum()) ;
    }
   
    private static void exception(PeekInputStream inputStream, String message)
    {
        exception(message, inputStream.getLineNum(), inputStream.getColNum()) ;
    }

    private static void exception(String message, long line, long col)
    {
        throw new RiotParseException(message, line, col) ;
    }
}
TOP

Related Classes of org.openjena.riot.tokens.TokenizerBytes

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.