package water.parser;
import water.DKV;
import water.H2O;
import water.Iced;
import water.Key;
import java.util.HashSet;
/**
 * Configuration and base guesser for a parse.
 *
 * <p>An instance carries the settings a parser needs (parser type, separator,
 * column count, column names, ...) together with the outcome of the initial
 * guess ({@code _isValid}, {@code _errors}, preview {@code _data}).  The static
 * {@code guessSetup} methods build such an instance from a chunk of raw bytes.
 */
public final class ParseSetup extends Iced {
  static final byte AUTO_SEP = -1; // Separator value meaning "not specified; to be guessed"
  Key[] _srcs;                     // Source Keys being parsed
  int _checkHeader;                // 1st row: 0: guess, +1 header, -1 data
  // Whether or not single-quotes quote a field.  E.g. how do we parse:
  // raw data:  123,'Mally,456,O'Mally
  // singleQuotes==True  ==> 2 columns: 123  and  Mally,456,OMally
  // singleQuotes==False ==> 4 columns: 123  and  'Mally  and  456  and  O'Mally
  boolean _singleQuotes;
  String _hexName;                 // Cleaned up result Key suggested name
  ParserType _pType;               // CSV, XLS, XLSX, SVMLight, Auto, ARFF
  byte _sep;                       // Field separator, usually comma ',' or TAB or space ' '
  int _ncols;                      // Columns to parse
  String[] _columnNames;
  String[][] _domains;             // Domains for each column (null if numeric)
  byte[] _ctypes;                  // Column types using types defined by FVecDataOut (UCOL, NCOL, etc).
  String[][] _data;                // First few rows of parsed/tokenized data
  boolean _isValid;                // The initial parse is sane
  String[] _errors;                // Errors in this parse setup
  long _invalidLines;              // Number of broken/invalid lines found
  long _headerlines;               // Number of header lines found

  /**
   * Full constructor; every argument is stored as-is in the matching field
   * (arrays are not copied).  {@code _srcs} and {@code _hexName} are not set here.
   */
  public ParseSetup(boolean isValid, long invalidLines, long headerlines, String[] errors, ParserType t, byte sep, int ncols, boolean singleQuotes, String[] columnNames, String[][] domains, String[][] data, int checkHeader, byte[] ctypes) {
    _isValid = isValid;
    _invalidLines = invalidLines;
    _headerlines = headerlines;
    _errors = errors;
    _pType = t;
    _sep = sep;
    _ncols = ncols;
    _singleQuotes = singleQuotes;
    _columnNames = columnNames;
    _domains = domains;
    _data = data;
    _checkHeader = checkHeader;
    _ctypes = ctypes;
  }

  // Invalid setup based on a prior valid one: copies the settings of 'ps',
  // marks the result invalid, records 'err' as its only error and drops the
  // column types.
  ParseSetup(ParseSetup ps, String err) {
    this(false, ps._invalidLines, ps._headerlines, new String[]{err}, ps._pType, ps._sep, ps._ncols, ps._singleQuotes, ps._columnNames, ps._domains, ps._data, ps._checkHeader, null);
  }

  // Called from Nano request server with a set of Keys, produce a suitable parser setup guess.
  // Leaves every field at its Java default (in particular _pType is null).
  public ParseSetup() {
  }

  /** @return true iff column names are present. */
  final boolean hasHeaders() { return _columnNames != null; }

  /** @return the recorded number of header lines. */
  final long headerLines() { return _headerlines; }

  /**
   * Creates the parser matching {@code _pType}.
   *
   * @return a new CSV, XLS, SVMLight or ARFF parser configured with this setup
   * @throws RuntimeException via {@code H2O.fail()} for any other parser type
   */
  public Parser parser() {
    switch( _pType ) {
      case CSV:      return new CsvParser(this);
      case XLS:      return new XlsParser(this);
      case SVMLight: return new SVMLightParser(this);
      case ARFF:     return new ARFFParser(this);
    }
    throw H2O.fail();
  }

  /**
   * Finds column names that occur more than once.
   *
   * @return the set of duplicated names; empty (never null) when there are no
   *         column names or no duplicates
   */
  HashSet<String> checkDupColumnNames() {
    HashSet<String> conflictingNames = new HashSet<>();
    if( _columnNames == null ) return conflictingNames;
    HashSet<String> seenNames = new HashSet<>();
    for( String name : _columnNames )
      if( !seenNames.add(name) )   // add() returns false when the name was already seen
        conflictingNames.add(name);
    return conflictingNames;
  }

  /**
   * Describes this setup: the error messages (one per line) when errors are
   * present, otherwise the parser type's description of column count and
   * separator.
   */
  @Override public String toString() {
    if( _errors != null ) {
      StringBuilder sb = new StringBuilder();
      for( String e : _errors ) sb.append(e).append("\n");
      return sb.toString();
    }
    // A default-constructed setup has no parser type yet; do not NPE in toString.
    if( _pType == null ) return "ParseSetup(no parser type set)";
    return _pType.toString( _ncols, _sep );
  }

  /**
   * Checks whether no token of a line looks like a number, a time or a UUID.
   *
   * @param line tokens of one row
   * @return false as soon as one token parses as a double, a time or a UUID;
   *         true otherwise
   */
  static boolean allStrings(String [] line){
    ValueString str = new ValueString();
    for( String s : line ) {
      try {
        Double.parseDouble(s);
        return false;       // Number in 1st row guesses: No Column Header
      } catch (NumberFormatException e) { /*Pass - determining if number is possible*/ }
      if( ParseTime.attemptTimeParse(str.setTo(s)) != Long.MIN_VALUE ) return false;
      ParseTime.attemptUUIDParse0(str.setTo(s));
      ParseTime.attemptUUIDParse1(str);
      if( str.get_off() != -1 ) return false; // Valid UUID parse
    }
    return true;
  }

  // simple heuristic to determine if we have headers:
  // return true iff the first line is all strings and second line has at least
  // one token that is not a plain string (number, time or UUID)
  static boolean hasHeader(String[] l1, String[] l2) {
    return allStrings(l1) && !allStrings(l2);
  }

  // Guess everything from a single pile-o-bits.  Used in tests, or in initial
  // parser inspections when the user has not told us anything about separators
  // or headers.
  public static ParseSetup guessSetup( byte[] bits, boolean singleQuotes, int checkHeader ) {
    return guessSetup(bits, ParserType.AUTO, AUTO_SEP, -1, singleQuotes, checkHeader, null, null);
  }

  // Order in which parser types are tried when the type is AUTO.
  private static final ParserType[] guessTypeOrder = {ParserType.ARFF, ParserType.XLS,ParserType.XLSX,ParserType.SVMLight,ParserType.CSV};

  /**
   * Guesses a setup for the given bytes using the requested parser type.
   *
   * <p>For CSV, SVMLight, XLS and ARFF the matching parser's guesser is called
   * directly.  For AUTO every type in {@code guessTypeOrder} is tried in turn
   * and the first valid result wins; a guesser that throws is treated as "not
   * this type".  Any other outcome yields an invalid setup whose only error is
   * "Cannot determine file type".
   *
   * @return the guessed setup; invalid (with an error message) when no parser
   *         type could be determined
   */
  public static ParseSetup guessSetup( byte[] bits, ParserType pType, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, String[][] domains ) {
    switch( pType ) {
      case CSV:      return      CsvParser.CSVguessSetup(bits,sep,ncols,singleQuotes,checkHeader,columnNames);
      case SVMLight: return SVMLightParser.   guessSetup(bits);
      case XLS:      return      XlsParser.   guessSetup(bits);
      case ARFF:     return     ARFFParser.   guessSetup(bits, sep, ncols, singleQuotes, checkHeader, columnNames);
      case AUTO:
        for( ParserType candidate : guessTypeOrder ) {
          try {
            ParseSetup ps = guessSetup(bits,candidate,sep,ncols,singleQuotes,checkHeader,columnNames,domains);
            if( ps != null && ps._isValid ) return ps;
          } catch( Throwable ignored ) { /*ignore failed parse attempt*/ }
        }
        break;
      default:
        break;              // No guesser for this type; report failure below
    }
    return new ParseSetup( false, 0, 0, new String[]{"Cannot determine file type"}, pType, sep, ncols, singleQuotes, columnNames, domains, null, checkHeader, null);
  }

  // Guess a local setup that is compatible to the given global (this) setup.
  // If they are not compatible, there will be _errors set.
  ParseSetup guessSetup( byte[] bits, int checkHeader ) {
    assert _isValid;
    ParseSetup ps = guessSetup(bits, _singleQuotes, checkHeader);
    if( !ps._isValid ) return ps; // Already invalid

    // ARFF wins over CSV (Note: ARFF might not know separator or ncols yet)
    if( _pType == ParserType.CSV && ps._pType == ParserType.ARFF ) {
      if( ps._sep == ParseSetup.AUTO_SEP && _sep != ParseSetup.AUTO_SEP ) ps._sep = _sep; //use existing separator
      return ps;
    }
    if( _pType == ParserType.ARFF && ps._pType == ParserType.CSV ) {
      // Note: this updates the global (this) setup in place.
      if( ps._sep != ParseSetup.AUTO_SEP && _sep == ParseSetup.AUTO_SEP ) _sep = ps._sep; //use existing separator
      return this;
    }

    // Parser types must match; for CSV and ARFF the separator and the column
    // count must match as well.
    boolean separatorBased = _pType == ParserType.CSV || _pType == ParserType.ARFF;
    boolean layoutDiffers  = _sep != ps._sep || _ncols != ps._ncols;
    if( _pType != ps._pType || (separatorBased && layoutDiffers) )
      return new ParseSetup(ps,"Conflicting file layouts, expecting: "+this+" but found "+ps+"\n");
    return ps;
  }

  /**
   * Derives a result-Key name from a file name and makes it unique in the DKV.
   *
   * <p>The directory part and up to two trailing extensions are removed, the
   * name is turned into a Java-identifier-like string, and ".hex" is appended.
   * While a Key with that name already exists in the DKV, an increasing number
   * is inserted before ".hex".
   *
   * @param n file name or path; must not be null
   * @return a name ending in ".hex" for which {@code DKV.get} returned null
   */
  protected static String hex( String n ) {
    // blahblahblah/myName.ext ==> myName
    // blahblahblah/myName.csv.ext ==> myName
    int sep = n.lastIndexOf(java.io.File.separatorChar);
    if( sep >= 0 ) n = n.substring(sep+1);  // >= 0: also strip a separator at index 0 ("/myName.ext")
    int dot = n.lastIndexOf('.');
    if( dot > 0 ) n = n.substring(0, dot);  // > 0: keep a name that is only a leading-dot extension
    int dot2 = n.lastIndexOf('.');
    if( dot2 > 0 ) n = n.substring(0, dot2);
    // "2012_somedata" ==> "X2012_somedata"
    // An empty name (e.g. path ending in a separator) becomes "X" instead of
    // failing on charAt(0).
    if( n.isEmpty() || !Character.isJavaIdentifierStart(n.charAt(0)) ) n = "X"+n;
    // "human%Percent" ==> "human_Percent"
    char[] cs = n.toCharArray();
    for( int i=1; i<cs.length; i++ )
      if( !Character.isJavaIdentifierPart(cs[i]) )
        cs[i] = '_';
    // "myName" ==> "myName.hex"
    n = new String(cs);
    int i = 0;
    String res = n + ".hex";
    Key k = Key.make(res);
    // Renumber to handle dup names
    while( DKV.get(k) != null )
      k = Key.make(res = n + ++i + ".hex");
    return res;
  }
} // ParseSetup state class