Package it.unimi.dsi.mg4j.test

Source Code of it.unimi.dsi.mg4j.test.SelectStats

package it.unimi.dsi.mg4j.test;

import it.unimi.dsi.mg4j.index.DiskBasedIndex;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.Util;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.util.Properties;

import java.io.FileReader;
import java.io.IOException;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

/** Selects part of a stats using global frequency.
*/

final public class SelectStats {
  @SuppressWarnings("unused")
  private final static Logger LOGGER = Util.getLogger( SelectStats.class );
 
  private SelectStats() {}

  /** A reasonable format for real numbers. */
  private static final java.text.NumberFormat formatDouble = new java.text.DecimalFormat( "#,##0.00000" );
 
  /** Formats a number.
   *
   * <P>This method formats a double separating thousands and printing just two fractional digits.
   * @param d a number.
   * @return a string containing a pretty print of the number.
   */
  public static String format( final double d ) {
    final StringBuffer s = new StringBuffer();
    return formatDouble.format( d, s, new java.text.FieldPosition( 0 ) ).toString();
  }
 

  public static void main( final String[] arg ) throws IOException, JSAPException, ConfigurationException {

    SimpleJSAP jsap = new SimpleJSAP( SelectStats.class.getName(), "Prints or selects parts of a stat file using global counts.",
      new Parameter[] {
        new Switch( "print", 'p', "print", "Just print global occurrences." ),
        new FlaggedOption( "globalFrequency", JSAP.DOUBLE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'g', "global-frequency", "The global count divided by the sum of document lengths that will be used to choose words to dump." ),
        new FlaggedOption( "quantumBitLength", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'q', "quantum-bit-length", "The quantum bit length that will be used to choose words to dump." ),
        new FlaggedOption( "error", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'e', "error", "The error w.r.t. frequency (as a percentage) that will be used to choose words to dump." ),
        new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The index basename." ),
        new UnflaggedOption( "statFile", JSAP.STRING_PARSER, JSAP.REQUIRED, "The stat file to be scanned." )
    });

    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;

    final boolean print = jsapResult.getBoolean( "print" );
    final String basename = jsapResult.getString( "basename" );
    final String statFile = jsapResult.getString( "statFile" );
    final int quantumBitLength = jsapResult.getInt( "quantumBitLength", 0 );
    final double globalFrequency = jsapResult.getDouble( "globalFrequency", 0 );
    final int error = jsapResult.getInt( "error", 1 );
    final double lowGlobFreq = globalFrequency * ( 1 - error / 100.0 );
    final double highGlobFreq = globalFrequency * ( 1 + error / 100.0 );
    final int lowQbl= (int)Math.round(quantumBitLength * ( 1 - error / 100.0 ));
    final int highQbl = (int)Math.round( quantumBitLength* ( 1 + error / 100.0 ) );

    final Properties properties = new Properties( basename + DiskBasedIndex.PROPERTIES_EXTENSION );
    final int numberOfTerms = properties.getInt( Index.PropertyKeys.TERMS );
    final long numberOfoccurrences = properties.getLong( Index.PropertyKeys.OCCURRENCES );
   
    final InputBitStream globCounts = new InputBitStream( basename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
    long gc[] = new long[ numberOfTerms ];
    for( int t = 0; t < numberOfTerms; t++ ) gc[ t ] = globCounts.readLongGamma();
    globCounts.close();

    final MutableString line = new MutableString();
    MutableString number;
    final FastBufferedReader reader = new FastBufferedReader( new FileReader( statFile ) );
   
    boolean dumping = false;
    int f, q;
    reader.readLine( line );
    while( reader.readLine( line ) != null ) {
      if ( line.charAt( 0 ) == '#' ) {
        number = line.substring( 2 );
        f = Integer.parseInt( number.delete( number.indexOf( ' ' ), number.length() ).toString() );
        double freq = (double)gc[ f ] / numberOfoccurrences;
        if ( print ) System.out.println( line + " " + format( freq ) );
        else {
          if ( quantumBitLength != 0 ) {
            // We choose using the quantum bit length
            number = line.substring( 2 );
            number = number.substring( number.indexOf( ' ' ) + 1 );
            q = Integer.parseInt( number.delete( number.indexOf( ' ' ), number.length() ).toString() );
            dumping = q >= lowQbl && q <= highQbl;
          }
          else dumping = freq >= lowGlobFreq && freq <= highGlobFreq;
        }
        if ( dumping ) line.println( System.out );
       }
      else if ( ! print && dumping ) {
        line.println( System.out );
      }
    }
  }
}
TOP

Related Classes of it.unimi.dsi.mg4j.test.SelectStats

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.