Package edu.umd.cloud9.webgraph.data

Source Code of edu.umd.cloud9.webgraph.data.AnchorText

/*
* Cloud9: A MapReduce Library for Hadoop
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package edu.umd.cloud9.webgraph.data;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.WritableComparable;

import bak.pcj.IntIterator;
import bak.pcj.set.IntOpenHashSet;


/**
* <p>
* This data structure represents a line of anchor text. A line of anchor text has
* some text, a weight, and a set of sources (targets) associated with it.
* Sources (targets) are the pages a line of anchor text originates
* from (points to) when the underlying link is an incoming (outgoing) link.
* </p>
*
* <p>
* The implemented iterator makes it possible to iterate through the source or target
* documents for each line of anchor text.
* </p>
*
* @author Nima Asadi
*
*/


public class AnchorText implements WritableComparable<AnchorText>,
                                   AnchorTextConstants, Iterable<Integer> {
  //AnchorText objects can be incoming or outgoing,
  //external or internal, etc. depending on the type
  private byte type;

  //holds the text for a line of anchor text
  private String text;

  //sources (or targets in case the underlying link is an outgoing one)
  private IntOpenHashSet documentList;

  //weight for a line of anchor text, if defined
  private float weight;

  /**
   * Creates an empty Internal Incoming Link AnchorText object
   */
  public AnchorText() {
    documentList = new IntOpenHashSet();
    resetToType(Type.INTERNAL_IN_LINK.val);
  }

  /**
   * Creates a new AnchorText object
   *
   * @param type Internal or external, incoming or outgoing, etc.
   *        (see {@link: AnchorTextConstants})
   * @param text Text associated with a line of anchor text
   */
  public AnchorText(byte type, String text) {
    this();
    resetToType(type);
    setText(text);
  }

  /**
   * Creates a new AnchorText object and adds a new source/target document if
   * the AnchorText object is allowed to have text.
   * (see {@link AnchorTextConstants interface}
   *
   * @param type Internal or external, incoming or outgoing, etc.
   *        (see {@link AnchorTextConstants}
   * @param text Text associated with a line of anchor text
   * @param docno Source/Target document id
   */
  public AnchorText(byte type, String text, int docno) {
    this();
    resetToType(type);
    setText(text);

    if(hasValidDocumentList()) {
      addDocument(docno);
    }
  }

  /**
   * Deserializes an AnchorText object.
   *
   * @param in Input Stream
   */
  public void readFields(DataInput in) throws IOException {
    this.type = in.readByte();
    resetToType(this.type);

    if(hasValidText()) {
      this.text = in.readUTF();
    }

    if(hasValidDocumentList()) {
      int size = in.readInt();
      for(int i = 0; i < size; i++) {
        this.documentList.add(in.readInt());
      }
    }

    if(hasValidWeight()) {
      this.weight = in.readFloat();
    }
  }

  /**
   * Serializes an AnchorText object
   *
   * @param out
   *         Output Stream
   */
  public void write(DataOutput out) throws IOException {
    out.writeByte(type);

    if(hasValidText()) {
      out.writeUTF(text);
    }

    if(hasValidDocumentList()) {
      out.writeInt(documentList.size());
      IntIterator iterator = documentList.iterator();
      while(iterator.hasNext()) {
        out.writeInt(iterator.next());
      }
    }

    if(hasValidWeight()) {
      out.writeFloat(weight);
    }
  }

  /**
   * @return the type of this AnchorText
   */
  public byte getType() {
    return type;
  }

  /**
   * Clears this object and initializes the type
   *
   * @param type
   *         New type
   */
  public void resetToType(byte type) {
    this.type = type;

    if(hasValidText()) {
      this.text = EMPTY_STRING;
    } else {
      this.text = null;
    }

    weight = 0;
    documentList.clear();
  }

  /**
   * @return the text of this line of anchor text.
   *         Null if the AnchorText object does not
   *         have any text associated with it.
   */
  public String getText() {
    return text;
  }

  /**
   * Sets the text for this line of anchor text
   * @param text New text for this anchor text
   */
  public void setText(String text) {
    if(hasValidText()) {
      this.text = text;
    }
  }

  /**
   * @return the weight for this anchor text
   */
  public float getWeight() {
    return weight;
  }

  /**
   * Sets a new weight for this line of anchor text and changes the type to "weighted"
   *
   * @param weight New weight
   */
  public void setWeight(float weight) {
    if(isExternalInLink()) {
      this.weight = weight;
      this.type = Type.WEIGHTED_EXTERNAL_IN_LINK.val;
    }
  }

  /**
   * @return the cardinality of the set of sources/targets
   */
  public int getSize() {
    return documentList.size();
  }

  /**
   * Returns a list of all the sources/targets
   *
   * @return An array of ints that contains all the
   *         sources/targets of the current object
   */
  public int[] getDocuments() {
    return documentList.toArray();
  }

  /**
   * Adds a new source/target to this anchor text.
   *
   * @param docno The new document id to be added to the source/target list
   */
  public void addDocument(int docno) {
    if(!hasValidDocumentList()) {
      return;
    }
    documentList.add(docno);
  }

  /**
   * Adds the sources/targets from another AnchorText to the current object
   *
   * @param other The other AnchorText object from which the
   *        sources/targets are to be copied.
   */
  public void addDocumentsFrom(AnchorText other) {
    if(!hasValidDocumentList()) {
      return;
    }

    IntIterator iterator = other.documentList.iterator();
    while(iterator.hasNext()) {
      addDocument(iterator.next());
    }
  }

  /**
   * Checks whether a document is a source or a target for this anchor text
   *
   * @param docno Document to be checked
   * @return True if the document is a source/target of this anchor text
   */
  public boolean containsDocument(int docno) {
    if(!hasValidDocumentList()) {
      return false;
    }
    return documentList.contains(docno);
  }

  /**
   * Checks whether two lines of anchor text share a source/target document
   *
   * @param other The other anchor text
   * @return True if this line of anchor text has a common source/target
   *         with the other line of anchor text.
   */
  public boolean intersects(AnchorText other) {
    if(!hasValidDocumentList() || !other.hasValidDocumentList()) {
      return false;
    }
    if(getSize() < other.getSize()) {
      IntIterator iterator = documentList.iterator();
      while(iterator.hasNext()) {
        if(other.containsDocument(iterator.next())) {
          return true;
        }
      }
    } else {
      IntIterator iterator = other.documentList.iterator();
      while(iterator.hasNext()) {
        if(documentList.contains(iterator.next())) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Checks whether two lines of anchor text are equal,
   * regardless of their source/target lists.
   *
   * @param other The other anchor text to check against.
   * @return True if the text fields, weights, and types of two
   *         AnchorText objects are equal, regardless
   *         of their source/targets lists.
   */
  public boolean equalsIgnoreSources(AnchorText other) {
    if(type != other.getType()) {
      return false;
    }
    if(hasValidWeight() && weight != other.getWeight()) {
      return false;
    }
    if(hasValidText()) {
      return text.equals(other.getText());
    }
    return true;
  }

  /**
   * Does a thorough comparison of two AnchorText objects.
   */
  public boolean equals(Object obj) {
    AnchorText other = (AnchorText) obj;

    if(hasValidDocumentList() && other.hasValidDocumentList()) {
      if(documentList.size() != other.documentList.size()) {
        return false;
      }
      IntIterator iterator = documentList.iterator();
      while(iterator.hasNext()) {
        if(!other.containsDocument(iterator.next())) {
          return false;
        }
      }
    }
    return equalsIgnoreSources(other);
  }

  /**
   * For sorting purposes, the comparison is only
   * limited to the type and the text
   * of two AnchorText objects. To check complete
   * equality of the objects use the equals method.
   */
  public int compareTo(AnchorText obj) {
    byte fl = obj.getType();
    String text = obj.getText();

    if(type != fl) {
      return type < fl ? -1 : 1;
    }
    if(hasValidText()) {
      return this.text.compareTo(text);
    }
    return 0;
  }

  public int hashCode() {
    if(hasValidText()) {
      return text.hashCode() + type;
    }
    return type;
  }

  public String toString() {
    StringBuilder builder = new StringBuilder();
    builder.append("(");

    if(isExternalInLink()) {
      builder.append("ExternalInLink");
    } else if(isInternalInLink()) {
      builder.append("InternalInLink");
    } else if(isExternalOutLink()) {
      builder.append("ExternalOutLink");
    } else if(isInternalOutLink()) {
      builder.append("InternalOutLink");
    } else if(isDocnoField()) {
      builder.append("Docno");
    } else if(isURL()) {
      builder.append("URL");
    } else if(isInDegree()) {
      builder.append("Indegree");
    } else if(isOutDegree()) {
      builder.append("Outdegree");
    } else {
      builder.append("OtherType");
    }

    if(hasValidText()) {
      builder.append(", " + text);
    }
    if(hasValidDocumentList()) {
      builder.append(", " + documentList.toString());
    }
    if(hasValidWeight()) {
      builder.append(", w:" + weight);
    }
    builder.append(")");
    return builder.toString();
  }

  /**
   * Clones (deep copies) this object and returns a new AnchorText object.
   *
   *  @return A new AnchorText object that is a copy of the current object.
   */
  public AnchorText clone() {
    AnchorText cloned = new AnchorText();
    cloned.resetToType(type);
    cloned.setText(text);

    if(hasValidDocumentList()) {
      cloned.addDocumentsFrom(this);
    }
    if(hasValidWeight()) {
      cloned.weight = weight;
    }
    return cloned;
  }

  /**
   * @return True if the current object is any type of
   * external incoming link (i.e., Weighted, non-weighted, etc.).
   */
  public boolean isExternalInLink() {
    return type == Type.EXTERNAL_IN_LINK.val ||
      type == Type.WEIGHTED_EXTERNAL_IN_LINK.val;
  }

  /**
   * @return True if the current object is any type of internal incoming link.
   */
  public boolean isInternalInLink() {
    return type == Type.INTERNAL_IN_LINK.val;
  }

  /**
   * @return True if the current object is any type of external outgoing link.
   */
  public boolean isExternalOutLink() {
    return type == Type.EXTERNAL_OUT_LINK.val;
  }

  /**
   * @return True if the current object is any type of internal outgoing link.
   */
  public boolean isInternalOutLink() {
    return type == Type.INTERNAL_OUT_LINK.val;
  }

  /**
   * @return True if the current line of anchor text has a weight associated with it.
   */
  public boolean isWeighted() {
    return type == Type.WEIGHTED_EXTERNAL_IN_LINK.val;
  }

  /**
   * @return True if the current object is a special type of
   * AnchorText (i.e., is not a real line of anchor text) - InDegree
   */
  public boolean isInDegree() {
    return type == Type.IN_DEGREE.val;
  }

  /**
   * @return True if the current object is a special type of
   * AnchorText (i.e., is not a real line of anchor text) - OutDegree
   */
  public boolean isOutDegree() {
    return type == Type.OUT_DEGREE.val;
  }

  /**
   * @return True if the current object is a special type of
   * AnchorText (i.e., is not a real line of anchor text) - Holds document number
   */
  public boolean isDocnoField() {
    return type == Type.DOCNO_FIELD.val;
  }

  /**
   * @return True if the current object is a special type of
   * AnchorText (i.e., is not a real line of anchor text) - Holds a URL address
   */
  public boolean isURL() {
    return type == Type.URL_FIELD.val;
  }

  /**
   * @return True if the current object is a special type of
   * AnchorText (i.e., is not a real line of anchor text) - Any other types of AnchorText
   */
  public boolean isOfOtherTypes() {
    return type == Type.OTHER_TYPES.val;
  }

  //checks whether the anchor text object has a valid text field.
  public boolean hasValidText() {
    return (type == Type.EXTERNAL_IN_LINK.val ||
            type == Type.INTERNAL_IN_LINK.val ||
            type == Type.URL_FIELD.val ||
            type == Type.OTHER_TYPES.val ||
            type == Type.WEIGHTED_EXTERNAL_IN_LINK.val);
  }

  //checks whether the current object has a valid list of sources/targets.
  private boolean hasValidDocumentList() {
    return type != Type.URL_FIELD.val;
  }

  //checks wether the current object has a valid weight associated with it.
  private boolean hasValidWeight() {
    return type == Type.WEIGHTED_EXTERNAL_IN_LINK.val;
  }

  /**
   * Creates a new iterator for the current object. The iterator can be used to iterate through
   * the list of sources/targets associated with the current line of anchor text.
   *
   * @return A new iterator object.
   */
  public Iterator<Integer> iterator() {
    return new Iterator<Integer>() {
      IntIterator iterator = documentList.iterator();

      public boolean hasNext() {
        return iterator.hasNext();
      }

      public Integer next() {
        return iterator.next();
      }

      public void remove() {
        iterator.remove();
      }
    };
  }
}
TOP

Related Classes of edu.umd.cloud9.webgraph.data.AnchorText

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.