Package it.unimi.dsi.mg4j.util.parser.callback

Source Code of it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor$Anchor

package it.unimi.dsi.mg4j.util.parser.callback;

/*    
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2006-2010 Paolo Boldi
*
*  This library is free software; you can redistribute it and/or modify it
*  under the terms of the GNU Lesser General Public License as published by the Free
*  Software Foundation; either version 3 of the License, or (at your option)
*  any later version.
*
*  This library is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
*  for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/

import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectList;
import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.parser.Attribute;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.Element;
import it.unimi.dsi.parser.callback.DefaultCallback;
import it.unimi.dsi.util.CircularCharArrayBuffer;

import java.util.Map;

import org.apache.log4j.Logger;

/** A callback extracting anchor text. When instantiating the extractor, you can specify the number of characters to
* be considered before the anchor, after the anchor or during the anchor (just the first characters are taken into
* consideration in the last two characters, and just the last ones in the first case).
*
* <p>At the end of parsing, the result (the list of anchors) is available in {@link #anchors}, whose
* elements provide the content of the <samp>href</samp> attribute
* the text of the anchor and around the anchor; text is however modified so that fragment of words at the beginning
* of the pre-anchor context, or at the end of the post-anchor context, are cut away.
*
* <p>For example, a fragment like:
*
* <code>
*    ...foo fOO FOO FOO <a href="xxx">ANCHOR TEXT</a> BAR BAR BAr bar...
* </code>
*
* (where the uppercase part represents the pre- and post-anchor context) generates the element
*
* <code>
*     Anchor("xxx", "FOO FOO ANCHOR TEXT BAR BAR")
* </code>
*/

public class AnchorExtractor extends DefaultCallback {

  /** A class representing an anchor. It is used to return the results of parsing.
   *
   */
  public final static class Anchor implements VirtualDocumentFragment {
    private static final long serialVersionUID = 1L;
    /** The content of the <samp>href</samp> attribute for this anchor. */
    private final MutableString href;
    /** The text surrounding this anchor. */
    private final MutableString anchorText;
   
    public Anchor( final MutableString href, final MutableString anchorText ) {
      this.href = href;
      this.anchorText = anchorText;
    }

    public MutableString documentSpecifier() {
      return href;
    }

    public MutableString text() {
      return anchorText;
    }
   
    public String toString() {
      return "<" + href + ", \"" + anchorText + "\">";
    }
  }
 
  public static final Logger LOGGER = Logger.getLogger( AnchorExtractor.class );
  public static final boolean DEBUG = false;
 
  /** The resulting list of {@linkplain Anchor anchors}. */
  public final ObjectList<Anchor> anchors = new ObjectArrayList<Anchor>();

  /** The circular buffer for pre-anchor context. */
  private final CircularCharArrayBuffer preAnchor;
  /** The circular buffer for anchor. */
  private final MutableString anchor;
  /** The maximum number of characters in the anchor. */
  private final int maxAnchor;
  /** The maximum number of characters after anchor. */
  private final int maxAfter;
  /** The post-anchor. */
  private final MutableString postAnchor;
  /** The current URL (if state is IN_ANCHOR). */
  private MutableString url;
  /** The resulting string (pre+anchor+post). */
  private MutableString result;
  /** When an anchor opens, the pre-anchor buffer is copied in this array. */
  private char[] preAnchorArray;

  private enum State {
    BEFORE_ANCHOR, IN_ANCHOR, AFTER_ANCHOR
  };
  private State state;
 
  /**
   *
   * @param maxBefore maximum number of words to be considered before of the anchor.
   * @param maxAfter maximum number of words to be considered after the anchor.
   */
  public AnchorExtractor( int maxBefore, int maxAnchor, int maxAfter ) {
    preAnchor = new CircularCharArrayBuffer( maxBefore );
    anchor = new MutableString( maxAnchor );
    postAnchor = new MutableString( maxAfter );
    result = new MutableString( maxBefore + maxAnchor + maxAfter );
    this.maxAfter = maxAfter;
    this.maxAnchor = maxAnchor;
    state = State.BEFORE_ANCHOR;
  }

  public void configure( final BulletParser parser ) {
    parser.parseTags( true );
    parser.parseAttributes( true );
    parser.parseText( true );
    parser.parseAttribute( Attribute.HREF );
  }

  public void startDocument() {
    state = State.BEFORE_ANCHOR;
    anchors.clear();
    preAnchor.clear();
    anchor.setLength( 0 );
    postAnchor.setLength( 0 );
    url = null;
  }
 
  public void endDocument() {
    if ( url != null ) {
      emit();
    }
    url = null;
  }
 
  public boolean startElement( final Element element, final Map<Attribute,MutableString> attrMap ) {   
    if ( element == Element.A && attrMap != null && attrMap.containsKey( Attribute.HREF ) ) {
      if ( state == State.AFTER_ANCHOR ) {
        emit();
        state = State.BEFORE_ANCHOR;
      }
      if ( state == State.BEFORE_ANCHOR ) {
        preAnchorArray = preAnchor.toCharArray();
        preAnchor.clear();
        if ( DEBUG ) System.out.println( "Freezing now pre: <" + new String( preAnchorArray ) + ">" );
        state = State.IN_ANCHOR;
        url = attrMap.get( Attribute.HREF );
        anchor.setLength( 0 );
        postAnchor.setLength( 0 );
      }
    }
    return true;
  }
 
  public boolean endElement( final Element element ) {
    if ( element == Element.A && state == State.IN_ANCHOR ) {
      state = State.AFTER_ANCHOR;
    }
    return true;
  }
 
  public boolean characters( final char[] characters, final int offset, final int length, final boolean flowBroken ) {
    switch ( state ) {
      case BEFORE_ANCHOR:
        preAnchor.add( characters, offset, length );
        break;
      case IN_ANCHOR:
        anchor.append( characters, offset, Math.min( length, maxAnchor - anchor.length() ) );
        break;
      case AFTER_ANCHOR:
        preAnchor.add( characters, offset, length );
        postAnchor.append( characters, offset, Math.min( length, maxAfter - postAnchor.length() ) );
        break;
    }
    if ( state == State.AFTER_ANCHOR && postAnchor.length() == maxAfter && url != null ) {
      emit();
      state = State.BEFORE_ANCHOR;
    }
    return true;
  }


  private void emit() {
    int posPre, posPost, posAnchor;
   
    // Cut pre until the first start of word
    posPre = 0;
    if ( preAnchorArray.length > 0 && Character.isLetterOrDigit( preAnchorArray[ posPre ] ) )
      // Skip starting non-space
      for ( ; posPre < preAnchorArray.length && Character.isLetterOrDigit( preAnchorArray[ posPre ] ); posPre++ );
    // Same for post
    char[] postAnchorArray = postAnchor.array();
    posPost = postAnchor.length() - 1;
    if ( posPost >= 0 && Character.isLetterOrDigit( postAnchorArray[ posPost ] ) ) {
      // Skip ending non-space
      for ( ; posPost >= 0 && Character.isLetterOrDigit( postAnchorArray[ posPost ] ); posPost-- );
    }
    // Same for anchor
    char[] anchorArray = anchor.array();
    posAnchor = anchor.length() - 1;
    if ( anchor.length() == maxAnchor && posAnchor >= 0 && Character.isLetterOrDigit( anchorArray[ posAnchor ] ) )
      // Skip starting non-space
      for ( ; posAnchor >= 0 && Character.isLetterOrDigit( anchorArray[ posAnchor ] ); posAnchor-- );
     
    result.setLength( 0 );
    result.append( preAnchorArray, posPre, preAnchorArray.length - posPre ).append( anchorArray, 0, posAnchor + 1 ).append( postAnchorArray, 0, posPost + 1 );
    anchors.add( new Anchor( url, result.copy() ) );
    url = null;
  }
}
TOP

Related Classes of it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor$Anchor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.