Package net.bpiwowar.mg4j.extensions.utils

Source Code of net.bpiwowar.mg4j.extensions.utils.StructuredTextExtractor

/**
*
*/
package net.bpiwowar.mg4j.extensions.utils;

import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.parser.Attribute;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.Element;
import it.unimi.dsi.parser.callback.Callback;
import net.bpiwowar.mg4j.extensions.TagPointer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.util.*;

/**
* A callback extracting text and titles for structured documents. This implementation
* also keeps track of the different segments within the text. It
* also stores the positions of elements and tags.
* <p>
* This callbacks extracts all text in the page, and the title. The resulting
* text is available through {@link #text}, and the title through {@link #title}
* . Furthermore, the segments of the resulting text are preserved.
* </p>
*
* @author Ingo Frommholz &lt;ingo@dcs.gla.ac.uk&gt
* @author Benjamin Piwowarski <benjamin@bpiwowar.net>
*/

public class StructuredTextExtractor implements Callback {
    private static final Log LOG = LogFactory.getLog(StructuredTextExtractor.class);

    /**
     * The text resulting from the parsing process.
     */
    public final MutableString text = new MutableString();
    /**
     * The title resulting from the parsing process.
     */
    public final MutableString title = new MutableString();

    /**
     * Tag pointers
     */
    private List<TagPointer> pointers = new ArrayList<TagPointer>();

    /**
     * Elements to ignore
     */
    private Set<Element> ignoredElements = new HashSet<>();

    /**
     * The current position in the stream
     */
    private int currentPointer = 0;

    /**
     * Counter that tells us how many elements to ignore are containing the current position
     */
    private int ignoringDepth = 0;

    public StructuredTextExtractor() {
    }

    /**
     * Ignores a type of element
     *
     * @param element The element to ignore
     */
    public void ignore(Element... element) {
        ignoredElements.addAll(Arrays.asList(element));
    }

    /**
     * Returns the tag pointers
     *
     * @return Iterator over tag pointers
     */
    public Iterator<TagPointer> tagPointer() {
        return pointers.iterator();
    }

    @Override
    public void startDocument() {
        LOG.trace("Starting a new document");
        text.length(0);
        currentPointer = 0;
        pointers = new ArrayList<TagPointer>();
        ignoringDepth = 0;

    }

    public List<TagPointer> getPointers() {
        return pointers;
    }


    @Override
    public boolean characters(final char[] characters, final int offset,
                              final int length, final boolean flowBroken) {
        if (ignoringDepth == 0) {
            text.append(characters, offset, length);
            currentPointer += length;
        }
        return true;
    }

    @Override
    public boolean endElement(Element element) {
        if (!ignoredElements.contains(element)) {
            pointers.add(new TagPointer(currentPointer, element, TagPointer.TagType.END));

            // put blank at element position if the element breaks the flow
            if (element.breaksFlow) {
                text.append(' ');
                currentPointer++;
            }
        } else if (--ignoringDepth < 0) ignoringDepth = 0;

        return true;
    }

    @Override
    public boolean startElement(Element element,
                                Map<Attribute, MutableString> attrMapUnused) {

        if (!ignoredElements.contains(element)) {
            pointers.add(new TagPointer(currentPointer, element,
                    TagPointer.TagType.START));

            // put blank at element position if the element breaks the flow
            if (element.breaksFlow) {
                text.append(' ');
                currentPointer++;
            }
        } else ++ignoringDepth;
        return true;
    }

    /**
     * Returns the text. Note that the text may not be trimmed.
     *
     * @return the text
     */
    public MutableString getText() {
        return this.text;
    }

    @Override
    public boolean cdata(Element element, char[] text, int offset, int length) {
        return true;
    }

    @Override
    public void endDocument() {

    }

    @Override
    public void configure(final BulletParser parser) {
        parser.parseText(true);
        parser.parseTags(true);
        parser.parseCDATA(false);
        parser.parseAttributes(false);
    }

}
TOP

Related Classes of net.bpiwowar.mg4j.extensions.utils.StructuredTextExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.