Package edu.ucla.sspace.text

Source Code of edu.ucla.sspace.text.IteratorFactory

/*
* Copyright 2009 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.text;

import edu.ucla.sspace.util.FileResourceFinder;
import edu.ucla.sspace.util.LimitedIterator;
import edu.ucla.sspace.util.ReflectionUtil;
import edu.ucla.sspace.util.ResourceFinder;

import java.io.BufferedReader;
import java.io.IOError;
import java.io.IOException;
import java.io.StringReader;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;


/**
* A factory class for generating {@code Iterator<String>} tokenizers for
* streams of tokens such as {@link BufferedReader} instances.  This class
* manages all of the internal configurations and properties for how to
* tokenize.  {@link edu.ucla.sspace.common.SemanticSpace SemanticSpace}
* instances are encouraged to utilize this class for creating iterators over
* the tokens in the documents rather than creating the iterators themsevles, as
* this class may contain additional settings to be applied to which the {@link
* edu.ucla.sspace.common.SemanticSpace SemanticSpace} instance would not have
* access.
*
* <p>
*
* This class offers two configurable parameters for controlling the tokenizing
* of streams.
*
* <dl style="margin-left: 1em">
*
* <dt> <i>Property:</i> <code><b>{@value #TOKEN_FILTER_PROPERTY}
*      </b></code> <br>
*      <i>Default:</i> <i>unset</i>
*
* <dd style="padding-top: .5em">This property sets a configuration of a {@link
*      TokenFilter} that should be applied to all token streams.<p>
*
* <dt> <i>Property:</i> <code><b>{@value #STEMMER_PROPERTY}
*      </b></code> <br>
*      <i>Default:</i> <i>unset</i>
*
* <dd style="padding-top: .5em">This property sets enables the use of the
*      {@link Stemmer} on all the tokens returned by iterators of this class.
*      The property value should be the fully qualified class name of a {@code
*      Stemmer} class implementation.<p>
*
* <dt> <i>Property:</i> <code><b>{@value #TOKEN_COUNT_LIMIT_PROPERTY}
*      </b></code> <br>
*      <i>Default:</i> <i>unset</i>
*
* <dd style="padding-top: .5em">This property sets the maximum number of tokens
*       returned by any iterator returned from this class.  It can be used to
*       artificially limit the total number of tokens per document.<p>
*
* <dt> <i>Property:</i> <code><b>{@value #COMPOUND_TOKENS_FILE_PROPERTY}
*      </b></code> <br>
*      <i>Default:</i> <i>unset</i>
*
* <dd style="padding-top: .5em">This property sets the name of a file that
*      Contains all of the recognized compound words (or multi-token tokens)
*      recognized by any iterators returned by this class.<p>
*
* </dl> <p>
*
* <p>
*
* Note that tokens will be combined into a compound token prior to filtering.
* Therefore if filtering is enabled, any compound token should also be
* permitted by the word filter.<p>
*
* Note that this class provides two distinct ways to access the token streams
* if filtering is enabled.  The {@link #tokenize(BufferedReader) tokenize}
* method will filter out any tokens without any indication.  This can
* significantly alter the original ordering of the token stream.  For
* applications where the original ordering needs to be preserved, the {@link
* #tokenizeOrdered(BufferedReader) tokenizeOrdered} method should be used
* instead.  This method will return the {@code IteratorFactor.EMTPY_TOKEN}
* value to indicate that a token has been removed.  This preserves the original
* token ordering without requiring applications to do the filtering themselves.
* Note that If filtering is disabled, the two methods will return the same
* tokens.<p>
*
* This class is thread-safe.
*
* @see WordIterator
* @see TokenFilter
* @see CompoundWordIterator
*/
public class IteratorFactory {

    /**
     * The signifier that stands in place of a token has been removed from an
     * iterator's token stream by means of a {@link TokenFilter}.  Tokens
     * returned by {@link #tokenizeOrdered(BufferedReader) tokenizeOrdered} may
     * be checked against this value to determine whether a token at that
     * position in the stream would have been returned but was removed.
     */
    public static final String EMPTY_TOKEN = "";

    /**
     * The prefix for naming publically accessible properties
     */
    private static final String PROPERTY_PREFIX =
        "edu.ucla.sspace.text.TokenizerFactory";

    /**
     * Specifies the {@link TokenFilter} to apply to all iterators generated by
     * this factory
     */
    public static final String TOKEN_FILTER_PROPERTY =
        PROPERTY_PREFIX + ".tokenFilter";

    /**
     * Specifies the {@link Stemmer} to use on tokens.  If not set, no stemming
     * is done.
     */
    public static final String STEMMER_PROPERTY =
        PROPERTY_PREFIX + ".stemmer";

    /**
     * Specifies the name of a file that contains all the recognized compound
     * tokens
     */
    public static final String COMPOUND_TOKENS_FILE_PROPERTY =
        PROPERTY_PREFIX + ".compoundTokens";
   
    /**
     * Specifies the name of a file which contains term replacement mappings for
     * a {@code WordReplacementIterator}.
     */
    public static final String TOKEN_REPLACEMENT_FILE_PROPERTY =
        PROPERTY_PREFIX + ".replacementTokens";

    /**
     * Specifices an upper limit on the number of tokens each iterator can
     * return.
     */
    public static final String TOKEN_COUNT_LIMIT_PROPERTY =
        PROPERTY_PREFIX + ".tokenCountLimit";


    /**
     * A list of all the factory properties supported for configuration by the
     * {@link IteratorFactory}.
     */
    public static final Set<String> ITERATOR_FACTORY_PROPERTIES =
        new HashSet<String>();
   
    // Static block for setting the properties
    static {
        ITERATOR_FACTORY_PROPERTIES.add(
                IteratorFactory.TOKEN_FILTER_PROPERTY);
        ITERATOR_FACTORY_PROPERTIES.add(
                IteratorFactory.STEMMER_PROPERTY);
        ITERATOR_FACTORY_PROPERTIES.add(
                IteratorFactory.COMPOUND_TOKENS_FILE_PROPERTY);
        ITERATOR_FACTORY_PROPERTIES.add(
                IteratorFactory.TOKEN_REPLACEMENT_FILE_PROPERTY);
        ITERATOR_FACTORY_PROPERTIES.add(
                IteratorFactory.TOKEN_COUNT_LIMIT_PROPERTY);
    }

    /**
     * An optional {@code TokenFilter} to use to remove tokens from document
     */
    private static TokenFilter filter;

    /**
     * The {@link ResourceFinder} used to locate the file-based resources used
     * by the iterator factory.  The default value for this is to read things
     * directly from {@code File} instances.
     */
    private static ResourceFinder resourceFinder = new FileResourceFinder();
   
    /**
     * True if stemming should be done in a word iterator.
     */
    private static Stemmer stemmer;

    /**
     * The maximum number of tokens an iterator may return.
     */
    private static int wordLimit;

    /**
     * An optional {@code Map} used to replace terms returned by iterators.
     */
    private static Map<String, String> replacementMap;

    /**
     * A mapping from a thread that is currently processing tokens to the {@link
     * CompoundWordIterator} doing the tokenizing if compound word support is
     * enabled.  This mapping is required for two reasons.  One to reduce the
     * overhead of creating {@code CompoundWordIterators} by calling {@code
     * reset} on them; and two, to provide a way for any updates to the list of
     * compound words to propagate to the threads that process them.
     */
    private static final Map<Thread,CompoundWordIterator> compoundIterators =
        new HashMap<Thread,CompoundWordIterator>();

    /**
     * The set of compound tokens recognized by the system or {@code null} if
     * none are recognized
     */
    private static Set<String> compoundTokens = null;

    /**
     * Uninstantiable
     */
    private IteratorFactory() { }

    /**
     * Reconfigures the type of iterator returned by this factory based on the
     * specified properties.
     */
    public static synchronized void setProperties(Properties props) {
        wordLimit = Integer.parseInt(
                props.getProperty(TOKEN_COUNT_LIMIT_PROPERTY, "0"));

        String filterProp =
            props.getProperty(TOKEN_FILTER_PROPERTY);
        filter = (filterProp != null)
            ? TokenFilter.loadFromSpecification(filterProp, resourceFinder)
            : null;
       
        // NOTE: future implementations may interpret the value of this property
        // to decide which stemmer to use
        String stemmerProp = props.getProperty(STEMMER_PROPERTY);
        if (stemmerProp != null)
            stemmer = ReflectionUtil.<Stemmer>getObjectInstance(stemmerProp);

        String compoundTokensProp =
            props.getProperty(COMPOUND_TOKENS_FILE_PROPERTY);
        if (compoundTokensProp != null) {
            // Load the tokens from file
            compoundTokens = new LinkedHashSet<String>();
            try {
                BufferedReader br = resourceFinder.open(compoundTokensProp);
                for (String line = null; (line = br.readLine()) != null; ) {
                    compoundTokens.add(line);
                }
                // For any currently processing threads, update their mapped
                // iterator with the new set of tokens
                for (Map.Entry<Thread,CompoundWordIterator> e
                     : compoundIterators.entrySet()) {
                    // Create an empy dummy BufferedReader, which will be
                    // discarded upon the next .reset() call to the iterator
                    BufferedReader dummyBuffer =
                        new BufferedReader(new StringReader(""));
                    e.setValue(new CompoundWordIterator(
                                dummyBuffer, compoundTokens));
                }
            } catch (IOException ioe) {
                // rethrow
                throw new IOError(ioe);
            }
        } else {
            // If the user did not specify a set of compound tokens, null out
            // the set, in the event that there was one previously
            compoundTokens = null;
        }

        String replacementProp =
            props.getProperty(TOKEN_REPLACEMENT_FILE_PROPERTY);
        if (replacementProp != null) {
            try {
                BufferedReader br = resourceFinder.open(replacementProp);
                replacementMap = new HashMap<String, String>();
                String line = null;
                while ((line = br.readLine()) != null) {
                    String[] termReplacement = line.split("\\s+");
                    replacementMap.put(termReplacement[0], termReplacement[1]);
                }
            } catch (IOException ioe) {
                throw new IOError(ioe);
            }
        } else
            replacementMap = null;
    }

    /**
     * Sets the {@link ResourceFinder} used by the iterator factory to locate
     * its file-based resources when configuring the tokenization.  This method
     * should be set prior to calling {@link #setProperties(Properties)
     * setProperties} to ensure that the resources are accessed correctly.  Most
     * applications will never need to call this method.
     *
     * @param finder the resource finder used to find and open file-based
     *        resources
     */
    public static void setResourceFinder(ResourceFinder finder) {
        resourceFinder = finder;
    }

    /**
     * Tokenizes the contents of the reader according to the system
     * configuration and returns an iterator over all the tokens, excluding
     * those that were removed by any configured {@link TokenFilter}.
     *
     * @param reader a reader whose contents are to be tokenized
     *
     * @return an iterator over all of the optionally-filtered tokens in the
     *         reader
     */
    public static Iterator<String> tokenize(BufferedReader reader) {
        return getBaseIterator(reader, false);
    }

    /**
     * Tokenizes the contents of the string according to the system
     * configuration and returns an iterator over all the tokens, excluding
     * those that were removed by any configured {@link TokenFilter}.
     *
     * @param str a string whose contents are to be tokenized
     *
     * @return an iterator over all of the optionally-filtered tokens in the
     *         string
     */
    public static Iterator<String> tokenize(String str) {
        return tokenize(new BufferedReader(new StringReader(str)));
    }

    /**
     * Tokenizes the contents of the reader according to the system
     * configuration and returns an iterator over all the tokens where any
     * removed tokens have been replaced with the {@code
     * IteratorFactory.EMPTY_TOKEN} value.  Tokens returned by this method may
     * be checked against this value to determine whether a token at that
     * position in the stream would have been returned but was removed.  In
     * doing this, the original order and positioning is retained.
     *
     * @param reader a reader whose contents are to be tokenized
     *
     * @return an iterator over all of the tokens in the reader where any tokens
     *         removed due to filtering have been replaced with the {@code
     *         IteratorFactory.EMPTY_TOKEN} value
     */
    public static Iterator<String> tokenizeOrdered(BufferedReader reader) {
        return getBaseIterator(reader, true);
    }

    /**
     * Tokenizes the contents of the string according to the system
     * configuration and returns an iterator over all the tokens where any
     * removed tokens have been replaced with the {@code
     * IteratorFactory.EMPTY_TOKEN} value.  Tokens returned by this method may
     * be checked against this value to determine whether a token at that
     * position in the stream would have been returned but was removed.  In
     * doing this, the original order and positioning is retained.
     *
     * @param str a string whose contents are to be tokenized
     *
     * @return an iterator over all of the tokens in the string where any tokens
     *         removed due to filtering have been replaced with the {@code
     *         IteratorFactory.EMPTY_TOKEN} value
     */
    public static Iterator<String> tokenizeOrdered(String str) {
        return tokenizeOrdered(new BufferedReader(new StringReader(str)));
    }

    /**
     * Wraps an iterator returned by {@link #tokenizeOrdered(String)
     * tokenizeOrdered} to also include term replacement of tokens.  Terms will
     * be replaced based on a mapping provided through the system configuration.
     *
     * @param reader A reader whose contents are to be tokenized.
     *
     * @return An iterator over all the tokens in the reader where any tokens
     *         removed due to filtering have been replaced with the {@code
     *         IteratorFactory.EMPTY_TOKEN} value, and tokens may be replaced
     *         based on system configuration.
     */
    public static Iterator<String> tokenizeOrderedWithReplacement(
            BufferedReader reader) {
        Iterator<String> baseIterator = tokenizeOrdered(reader);
        return (replacementMap == null)
            ? baseIterator
            : new WordReplacementIterator(baseIterator, replacementMap);
    }

    /**
     * Returns an iterator for the basic tokenization of the stream before
     * filtering has been applied to the tokens.
     *
     * @param reader a reader whose contents are to be tokenized
     *
     * @return an iterator over the tokens in the stream
     */
    private static Iterator<String> getBaseIterator(BufferedReader reader,
                                                    boolean keepOrdering) {

        // The final iterator is how the stream will be tokenized after all the
        // tokenizing options have been applied.  This value is iteratively set
        // as the options are applied
        Iterator<String> finalIterator = new WordIterator(reader);

        // STEP 1: APPLY TOKEN REPLACEMENT
        if (replacementMap != null)
            finalIterator =
                new WordReplacementIterator(finalIterator, replacementMap);

        // STEP 2: APPLY COMPOUND TOKENIZING
        if (compoundTokens != null) {
            // Because the initialization step for a CWI has some overhead, use
            // the reset to keep the same tokens.  However, multiple threads may
            // be each using their own CWI, so keep Thread-local storage of what
            // CWI is being used to avoid resetting another thread's iterator.
            CompoundWordIterator cwi =
                compoundIterators.get(Thread.currentThread());
            if (cwi == null) {
                cwi = new CompoundWordIterator(finalIterator, compoundTokens);
                compoundIterators.put(Thread.currentThread(), cwi);
            } else {
                // NOTE: if the underlying set of valid compound words is ever
                // changed, the iterator returned from the compoundIterators map
                // will have been updated by the setProperties() call, so this
                // method is guaranteed to pick up the latest set of compound
                // words
                cwi.reset(finalIterator);
            }
            finalIterator = cwi;
        }

        // STEP 3: APPLY TOKEN LIMITING
        if (wordLimit > 0)
            finalIterator = new LimitedIterator<String>(
                    finalIterator, wordLimit);

        // STEP 4: APPLY TOKEN FILTERING
        if (filter != null) {
            finalIterator = (keepOrdering)
                ? new OrderPreservingFilteredIterator(finalIterator, filter)
                : new FilteredIterator(finalIterator, filter);
        }

        // STEP 5: APPLY STEMMING
        if (stemmer != null)
            finalIterator = new StemmingIterator(finalIterator, stemmer);

        return finalIterator;
    }
}
TOP

Related Classes of edu.ucla.sspace.text.IteratorFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.