// Source Code of r.builtins.Scan

package r.builtins;


import java.io.*;
import java.util.*;


import r.*;
import r.Convert.ConversionStatus;
import r.Console;
import r.data.*;
import r.data.RComplex.Complex;
import r.data.internal.*;
import r.data.internal.Connection.FileConnection;
import r.errors.*;
import r.nodes.ast.*;
import r.nodes.exec.*;
import r.runtime.*;


/**
 * "scan"
 *
 * <pre>
 * file -- the name of a file to read data values from. If the specified file is "", then input
 *     is taken from the keyboard (or whatever stdin() reads if input is redirected or R is embedded).
 *     (In this case input can be terminated by a blank line or an EOF signal, Ctrl-D on Unix and Ctrl-Z
 *     on Windows.) Otherwise, the file name is interpreted relative to the current working directory
 *     (given by getwd()), unless it specifies an absolute path. Tilde-expansion is performed where
 *     supported. When running R from a script, file="stdin" can be used to refer to the process's stdin file stream.
 *     As from R 2.10.0 this can be a compressed file (see file). Alternatively, file can be a connection,
 *     which will be opened if necessary, and if so closed at the end of the function call. Whatever mode
 *     the connection is opened in, any of LF, CRLF or CR will be accepted as the EOL marker for a line and so will match sep = "\n".
 *     file can also be a complete URL. (For the supported URL schemes, see the URLs section of the help for url.)
 *     To read a data file not in the current encoding (for example a Latin-1 file in a UTF-8 locale or conversely) use a
 *     file connection setting its encoding argument (or scan's fileEncoding argument).
 * what -- the type of what gives the type of data to be read. The supported types are logical, integer, numeric, complex,
 *     character, raw and list. If what is a list, it is assumed that the lines of the data file are records each containing
 *     length(what) items (fields) and the list components should have elements which are one of the first six types
 *     listed or NULL.
 * nmax -- integer: the maximum number of data values to be read, or if what is a list, the maximum number of records
 *     to be read. If omitted or not positive or an invalid value for an integer (and nlines is not set to
 *     a positive value), scan will read to the end of file.
 * n -- integer: the maximum number of data values to be read, defaulting to no limit. Invalid values will be ignored.
 * sep -- by default, scan expects to read white-space delimited input fields. Alternatively, sep can be used to specify
 *     a character which delimits fields. A field is always delimited by an end-of-line marker unless it is quoted.
 *     If specified this should be the empty character string (the default) or NULL or a character string containing
 *     just one single-byte character.
 * quote -- the set of quoting characters as a single character string or NULL. In a multibyte locale the quoting
 *     characters must be ASCII (single-byte).
 * dec -- decimal point character. This should be a character string containing just one single-byte character.
 *     (NULL and a zero-length character vector are also accepted, and taken as the default.)
 * skip -- the number of lines of the input file to skip before beginning to read data values.
 * nlines -- if positive, the maximum number of lines of data to be read.
 * na.strings -- character vector. Elements of this vector are to be interpreted as missing (NA) values. Blank
 *      fields are also considered to be missing values in logical, integer, numeric and complex fields.
 * flush -- logical: if TRUE, scan will flush to the end of the line after reading the last of the fields requested.
 *     This allows putting comments after the last field, but precludes putting more than one record on a line.
 * fill -- logical: if TRUE, scan will implicitly add empty fields to any lines with fewer fields than implied by what.
 * strip.white -- vector of logical value(s) corresponding to items in the what argument. It is used only when sep has been
 *      specified, and allows the stripping of leading and trailing 'white space' from character fields (numeric fields are always stripped).
 *      Note: white space inside quoted strings is not stripped. If strip.white is of length 1, it applies to all fields; otherwise, if
 *      strip.white[i] is TRUE and the i-th field is of mode character (because what[i] is) then the leading and trailing
 *      unquoted white space from field i is stripped.
 * quiet -- logical: if FALSE (default), scan() will print a line, saying how many items have been read.
 * blank.lines.skip -- logical: if TRUE blank lines in the input are ignored, except when counting skip and nlines.
 * multi.line -- logical. Only used if what is a list. If FALSE, all of a record must appear on one line
 *      (but more than one record can appear on a single line). Note that using fill = TRUE implies that a record
 *      will be terminated at the end of a line.
 * comment.char -- character: a character vector of length one containing a single character or an empty string.
 *      Use "" to turn off the interpretation of comments altogether (the default).
 * allowEscapes -- logical. Should C-style escapes such as \n be processed (the default) or read verbatim? Note that
 *      if not within quotes these could be interpreted as a delimiter (but not as a comment character).
 *      The escapes which are interpreted are the control characters \a, \b, \f, \n, \r, \t, \v and octal and hexadecimal
 *      representations like \ 040 and \ 0x2A. Any other escaped character is treated as itself, including backslash. Note that
 *      Unicode escapes (starting \ u or \ U: see Quotes) are never processed.
 * fileEncoding -- character string: if non-empty declares the encoding used on a file (not a connection nor the keyboard) so
 *      the character data can be re-encoded.
 * encoding -- encoding to be assumed for input strings. If the value is "latin1" or "UTF-8" it is used to mark character
 *      strings as known to be in Latin-1 or UTF-8: it is not used to re-encode the input
 * text -- character string: if file is not supplied and this is, then data are read from the value of text via a text connection.
 * </pre>
 */
// TODO: a very incomplete implementation, the full method has 21 parameters
// note also that some of the current implementation will likely have to be rewritten for the full set of features
final class Scan extends CallFactory {


    /** Factory instance for the "scan" builtin, constructed with the parameters file, what, nmax and quiet and an empty required list. */
    static final CallFactory _ = new Scan("scan", new String[]{"file", "what", "nmax", "quiet"}, new String[]{});


    /**
     * Forwards the builtin name and its parameter lists to the {@code CallFactory} constructor.
     *
     * @param name name of the builtin
     * @param params parameter names handed to the superclass
     * @param required required-parameter names handed to the superclass
     */
    private Scan(String name, String[] params, String[] required) {
        super(name, params, required);
    }


    @Override public RNode create(ASTNode call, RSymbol[] names, RNode[] exprs) {
        ArgumentInfo ia = check(call, names, exprs);
        final ConnectionMode defaultMode = ConnectionMode.get("r");
        final int posWhat = ia.position("what");
        final int posNmax = ia.position("nmax");
        final int posQuiet = ia.position("quiet");
        final int posFile = ia.position("file");
        return new Builtin(call, names, exprs) {
            @Override public RAny doBuiltIn(Frame frame, RAny[] args) {
                RAny what = posWhat != -1 ? args[posWhat] : RDouble.EMPTY;
                int nmax = posNmax != -1 ? parseNMax(args[posNmax], ast) : -1;
                boolean quiet = posQuiet != -1 ? parseQuiet(args[posQuiet]) : false;


                if (what instanceof RList) { throw Utils.nyi("list not yet implemented"); }
                if (what instanceof RNull) { throw RError.getInvalidArgument(ast, "what"); }
                Connection con = null;
                boolean wasOpen = false;


                if (posFile == -1) { throw Utils.nyi("stdin, text"); } // FIXME: similar code to that in readLines, extract?
                RAny conArg = args[posFile];
                if (conArg instanceof RString) {
                    String description = File.getScalarString(conArg, ast, "file");
                    con = FileConnection.createOpened(description, defaultMode, ast);
                } else if (conArg instanceof RInt) {
                    // FIXME: check if it is a connection once attributes are implemented
                    RInt iarg = (RInt) conArg;
                    if (iarg.size() != 1) { throw RError.getNotConnection(ast, "file"); }
                    int handle = iarg.getInt(0);
                    con = RContext.getConnection(handle);
                    Utils.check(con != null);
                    if (con.isOpen()) {
                        ConnectionMode mode = con.currentMode();
                        if (!mode.read()) { throw RError.getCannotReadConnection(ast); }
                        wasOpen = true;
                    } else {
                        con.open(defaultMode, ast);
                    }
                }
                try {
                    // TODO: replace this primitive scanning by something more general
                    // note that we cannot simply use Scan because it would buffer too much data (Scan cannot push its remaining buffered data back to the
                    // underlying BufferedReader ; probably will have to implement a custom BufferedScanner for R
                    Reader reader = con.reader(ast);
                    ArrayList<String> buf = new ArrayList<>();
                    int nread = 0;
                    StringBuilder item = null;
                    int c;
                    c = skip_whites(reader);
                    if (c != -1) {
                        for (;;) {
                            if (is_white(c)) {
                                if (item != null) {
                                    buf.add(item.toString());
                                    nread++;
                                }
                                if (nread == nmax) {
                                    break;
                                }
                                c = skip_whites(reader);
                                item = null; // TODO: get rid of allocation
                                continue;
                            }
                            if (c == -1) {
                                if (item != null) {
                                    buf.add(item.toString());
                                    nread++;
                                }
                                break;
                            }
                            if (item == null) {
                                item = new StringBuilder();
                            }
                            item.append((char) c);
                            c = reader.read();
                        }
                    }
                    RAny res = scan(buf, ast, what);
                    if (!quiet) {
                        Console.println(String.format("Read %d item%s.", nread, nread == 1 ? "" : "s"));
                    }
                    return res;
                } catch (IOException e) {
                    throw RError.getGenericError(ast, e.toString());
                } finally {
                    if (!wasOpen) {
                        con.close(ast);
                    }
                }
            }
        };
    }


    public static int parseNMax(RAny arg, ASTNode ast) {
        RInt narg = Convert.coerceToIntWarning(arg, ast);
        if (narg.size() >= 1) { return narg.getInt(0); }
        return RInt.NA;
    }


    public static boolean parseQuiet(RAny arg) {
        RLogical narg = arg.asLogical();
        if (narg.size() >= 1) { return narg.getLogical(0) == RLogical.TRUE; }
        return false;
    }


    // Conversion status shared by the typed scan* helpers of this class; each of them clears naIntroduced
    // before converting its items.
    // NOTE(review): this is mutable static state -- presumably the helpers are only ever called from a
    // single thread; confirm before using them concurrently.
    private static ConversionStatus cs = new ConversionStatus();


    public static RString scanString(ArrayList<String> src, @SuppressWarnings("unused") ASTNode ast) {
        int size = src.size();
        String[] content = new String[size];
        return RString.RStringFactory.getFor(src.toArray(content));
    }


    public static RComplex scanComplex(ArrayList<String> src, ASTNode ast) {
        int size = src.size();
        cs.naIntroduced = false;
        double[] content = new double[2 * size];
        for (int i = 0; i < size; i++) {
            String str = src.get(i);
            Complex c = Convert.string2complex(str, cs);
            if (cs.naIntroduced) {
                if (str.equals("NA")) {
                    cs.naIntroduced = false;
                } else {
                    throw RError.getScanUnexpected(ast, "a complex", str);
                }
            }
            content[2 * i] = c.realValue();
            content[2 * i + 1] = c.imagValue();
        }
        return RComplex.RComplexFactory.getFor(content);
    }


    public static RDouble scanDouble(ArrayList<String> src, ASTNode ast) {
        int size = src.size();
        cs.naIntroduced = false;
        double[] content = new double[size];
        for (int i = 0; i < size; i++) {
            String str = src.get(i);
            content[i] = Convert.string2double(str, cs);
            if (cs.naIntroduced) {
                if (str.equals("NA")) {
                    cs.naIntroduced = false;
                } else {
                    throw RError.getScanUnexpected(ast, "a real", str); }
                }
        }
        return RDouble.RDoubleFactory.getFor(content);
    }


    public static RInt scanInt(ArrayList<String> src, ASTNode ast) {
        int size = src.size();
        cs.naIntroduced = false;
        int[] content = new int[size];
        for (int i = 0; i < size; i++) {
            String str = src.get(i);
            content[i] = Convert.string2int(str, cs);
            if (cs.naIntroduced) {
                if (str.equals("NA")) {
                    cs.naIntroduced = false;
                } else {
                    throw RError.getScanUnexpected(ast, "an integer", str);
                }
            }
        }
        return RInt.RIntFactory.getFor(content);
    }


    public static RLogical scanLogical(ArrayList<String> src, ASTNode ast) {
        int size = src.size();
        cs.naIntroduced = false;
        int[] content = new int[size];
        for (int i = 0; i < size; i++) {
            String str = src.get(i);
            content[i] = Convert.string2logical(str, cs);
            if (cs.naIntroduced) {
                if (str.equals("NA")) {
                    cs.naIntroduced = false;
                } else {
                    throw RError.getScanUnexpected(ast, "a logical", str);
                }
            }
        }
        return RLogical.RLogicalFactory.getFor(content);
    }


    static int hexDigit(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10;
        }
        return -1;
    }


    public static RRaw scanRaw(ArrayList<String> src, ASTNode ast) {
        int size = src.size();
        cs.naIntroduced = false;
        cs.outOfRange = false;
        byte[] content = new byte[size];
        for (int i = 0; i < size; i++) {
            String str = src.get(i);


            // conversion taken from GNU-R (yes, a bit strange)
            if (str.equals("NA")) {
                // zero
                continue;
            }
            int slen = str.length();
            if (slen == 2) {
                int a = hexDigit(str.charAt(0));
                int b = hexDigit(str.charAt(1));
                if (a != -1 && b != -1) {
                    content[i] = (byte) (a * 16 + b);
                    continue;
                }
            }
            if (slen == 1) {
                if (hexDigit(str.charAt(0)) != -1) {
                    // zero;
                    continue;
                }
            }
            throw RError.getScanUnexpected(ast, "a raw", str);
        }
        return RRaw.RRawFactory.getFor(content);
    }


    public static RAny scan(ArrayList<String> src, ASTNode ast, RAny what) {
        if (what instanceof RString) { return scanString(src, ast); }
        if (what instanceof RDouble) { return scanDouble(src, ast); }
        if (what instanceof RInt) { return scanInt(src, ast); }
        if (what instanceof RLogical) { return scanLogical(src, ast); }
        if (what instanceof RRaw) { return scanRaw(src, ast); }
        if (what instanceof RComplex) { return scanComplex(src, ast); }
        throw RError.getInvalidArgument(ast, "what");
    }


    public static boolean is_white(int c) {
        return c == '\r' || c == '\n' || c == '\t' || c == ' ';
    }


    public static int skip_whites(Reader reader) throws IOException {
        int c;
        for (;;) {
            c = reader.read();
            if (!is_white(c)) { return c;


            }
        }
    }


}
// Source Code of r.builtins.Scan
//
// Related Classes of r.builtins.Scan