Package net.bpiwowar.mg4j.extensions.trec

Source Code of net.bpiwowar.mg4j.extensions.trec.TRECParsingFactory

package net.bpiwowar.mg4j.extensions.trec;

import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.parser.Attribute;
import it.unimi.dsi.parser.Element;
import it.unimi.dsi.parser.Entity;
import it.unimi.dsi.parser.ParsingFactory;

/**
* A parsing factory for TREC (AP/WSJ collections) documents
*/
public class TRECParsingFactory implements ParsingFactory {


    @Override
  public Element getElement(final MutableString name) {
    return NAME2ELEMENT.get(name.toString().toLowerCase());
  }

  @Override
  public Attribute getAttribute(final MutableString name) {
    return NAME2ATTRIBUTE.get(name.toString().toLowerCase());
  }

  @Override
  public Entity getEntity(final MutableString name) {
    return NAME2ENTITY.get(name.toString());
  }

  public static final TRECParsingFactory INSTANCE = new TRECParsingFactory();

  /** A (quick) map from entity names to entites. */
  static final Object2ObjectOpenHashMap<String, Entity> NAME2ENTITY = new Object2ObjectOpenHashMap<String, Entity>(
      Hash.DEFAULT_INITIAL_SIZE, .5f);

  /** A (quick) map from attribute names to attributes. */
  static final Object2ObjectOpenHashMap<String, Attribute> NAME2ATTRIBUTE = new Object2ObjectOpenHashMap<String, Attribute>(
      Hash.DEFAULT_INITIAL_SIZE, .5f);

  /** A (quick) map from element-type names to element types. */
  static final Object2ObjectOpenHashMap<String, Element> NAME2ELEMENT = new Object2ObjectOpenHashMap<String, Element>(
      Hash.DEFAULT_INITIAL_SIZE, .5f);

  static Element newElement(final String name) {
    final Element element = new Element(name);
    NAME2ELEMENT.put(element.name.toString(), element);
    return element;
  }

  static Element newElement(final String name, boolean breaksFlow,
      boolean isSimple) {
    final Element element = new Element(name, breaksFlow, isSimple);
    NAME2ELEMENT.put(element.name.toString(), element);
    return element;
  }

  static Entity newEntity(final String name, final char c) {
    final Entity entity = new Entity(name, c);
    NAME2ENTITY.put(entity.name.toString(), entity);
    return entity;
  }

  static Attribute newAttribute(final String name) {
    final Attribute attribute = new Attribute(name);
    NAME2ATTRIBUTE.put(attribute.name.toString(), attribute);
    return attribute;
  }

  /*
   * Attributes
   */
  public static final Attribute ATTR_NUMBER = newAttribute("number");
  public static final Attribute ATTR_TYPE = newAttribute("type");

  /*
   * Elements
   */
  public static final Element ELEMENT_NUM = newElement("num"),
      ELEMENT_TOP = newElement("top"),
      ELEMENT_TITLE = newElement("title"),
      ELEMENT_DESC = newElement("desc"),
     
      // Summary 
      ELEMENT_NARR = newElement("narr", true, true),
      ELEMENT_SMRY = newElement("smry"),
     
      // Definition in trec topic
      ELEMENT_DEF = newElement("def"),
     
      // Concept in trec topic
      ELEMENT_CON = newElement("con"),

      // javascript and friends
      ELEMENT_SCRIPT = newElement("script", true, true),


            ELEMENT_SUMMARY = newElement("summary", true, true),
            ELEMENT_HL = newElement("hl", true, true),
            ELEMENT_ABST = newElement("abst", true, true),
            ELEMENT_BODY = newElement("body", true, true),
            ELEMENT_TEXT = newElement("text", true, true),
      ELEMENT_P = newElement("p", true, true),
      ELEMENT_SECTION = newElement("section", true, true),
      ELEMENT_LEADPARA = newElement("leadpara", true, true),
      ELEMENT_DESCRIPT = newElement("descript", true, true),
      ELEMENT_FIRST = newElement("first", true, true),
      ELEMENT_SECOND = newElement("second", true, true),
      ELEMENT_PAGE = newElement("page", true, true),
      ELEMENT_MEMO = newElement("memo", true, true),
      ELEMENT_PUB = newElement("pub", true, true),
      ELEMENT_SLUG = newElement("slug", true, true),
      ELEMENT_HEADLINE = newElement("headline", true, true),
      ELEMENT_CENTER = newElement("center", true, true),
      ELEMENT_TTL = newElement("ttl", true, true),
      ELEMENT_DATE = newElement("date", true, true),
      ELEMENT_DATETIME = newElement("date_time", true, true),
      ELEMENT_PUBDATE = newElement("pubdate", true, true),
      ELEMENT_DOCNO = newElement("docno", true, true),
      ELEMENT_DOCID = newElement("docid", true, true),
      ELEMENT_DOCHDR = newElement("dochdr", true, true)
      ELEMENT_HEADER = newElement("header", true, true),
      ELEMENT_PROFILE = newElement("profile", true, true),
      ELEMENT_COPYRGHT = newElement("copyrght", true, true),
      ELEMENT_FILEID = newElement("fileid", true, true),
      ELEMENT_BYLINE = newElement("byline", true, true),

      // Elements used for contemporary Web track topics
      ELEMENT_TOPIC = newElement("topic"),
      ELEMENT_QUERY = newElement("query"),
      ELEMENT_DESCRIPTION = newElement("description"),
      ELEMENT_SUBTOPIC = newElement("subtopic");
 

  static {
    NAME2ATTRIBUTE.defaultReturnValue(Attribute.UNKNOWN);
    NAME2ELEMENT.defaultReturnValue(Element.UNKNOWN);

    // --- Entity Names -----------------------------------

    // Latin 1
    newEntity("nbsp", (char) 160);
    newEntity("iexcl", (char) 161);
    newEntity("cent", (char) 162);
    newEntity("pound", (char) 163);
    newEntity("curren", (char) 164);
    newEntity("yen", (char) 165);
    newEntity("brvbar", (char) 166);
    newEntity("sect", (char) 167);
    newEntity("uml", (char) 168);
    newEntity("copy", (char) 169);
    newEntity("ordf", (char) 170);
    newEntity("laquo", (char) 171);
    newEntity("not", (char) 172);
    newEntity("shy", (char) 173);
    newEntity("reg", (char) 174);
    newEntity("macr", (char) 175);
    newEntity("deg", (char) 176);
    newEntity("plusmn", (char) 177);
    newEntity("sup2", (char) 178);
    newEntity("sup3", (char) 179);
    newEntity("acute", (char) 180);
    newEntity("micro", (char) 181);
    newEntity("para", (char) 182);
    newEntity("middot", (char) 183);
    newEntity("cedil", (char) 184);
    newEntity("sup1", (char) 185);
    newEntity("ordm", (char) 186);
    newEntity("raquo", (char) 187);
    newEntity("frac14", (char) 188);
    newEntity("frac12", (char) 189);
    newEntity("frac34", (char) 190);
    newEntity("iquest", (char) 191);
    newEntity("Agrave", (char) 192);
    newEntity("Aacute", (char) 193);
    newEntity("Acirc", (char) 194);
    newEntity("Atilde", (char) 195);
    newEntity("Auml", (char) 196);
    newEntity("Aring", (char) 197);
    newEntity("AElig", (char) 198);
    newEntity("Ccedil", (char) 199);
    newEntity("Egrave", (char) 200);
    newEntity("Eacute", (char) 201);
    newEntity("Ecirc", (char) 202);
    newEntity("Euml", (char) 203);
    newEntity("Igrave", (char) 204);
    newEntity("Iacute", (char) 205);
    newEntity("Icirc", (char) 206);
    newEntity("Iuml", (char) 207);
    newEntity("ETH", (char) 208);
    newEntity("Ntilde", (char) 209);
    newEntity("Ograve", (char) 210);
    newEntity("Oacute", (char) 211);
    newEntity("Ocirc", (char) 212);
    newEntity("Otilde", (char) 213);
    newEntity("Ouml", (char) 214);
    newEntity("times", (char) 215);
    newEntity("Oslash", (char) 216);
    newEntity("Ugrave", (char) 217);
    newEntity("Uacute", (char) 218);
    newEntity("Ucirc", (char) 219);
    newEntity("Uuml", (char) 220);
    newEntity("Yacute", (char) 221);
    newEntity("THORN", (char) 222);
    newEntity("szlig", (char) 223);
    newEntity("agrave", (char) 224);
    newEntity("aacute", (char) 225);
    newEntity("acirc", (char) 226);
    newEntity("atilde", (char) 227);
    newEntity("auml", (char) 228);
    newEntity("aring", (char) 229);
    newEntity("aelig", (char) 230);
    newEntity("ccedil", (char) 231);
    newEntity("egrave", (char) 232);
    newEntity("eacute", (char) 233);
    newEntity("ecirc", (char) 234);
    newEntity("euml", (char) 235);
    newEntity("igrave", (char) 236);
    newEntity("iacute", (char) 237);
    newEntity("icirc", (char) 238);
    newEntity("iuml", (char) 239);
    newEntity("eth", (char) 240);
    newEntity("ntilde", (char) 241);
    newEntity("ograve", (char) 242);
    newEntity("oacute", (char) 243);
    newEntity("ocirc", (char) 244);
    newEntity("otilde", (char) 245);
    newEntity("ouml", (char) 246);
    newEntity("divide", (char) 247);
    newEntity("oslash", (char) 248);
    newEntity("ugrave", (char) 249);
    newEntity("uacute", (char) 250);
    newEntity("ucirc", (char) 251);
    newEntity("uuml", (char) 252);
    newEntity("yacute", (char) 253);
    newEntity("thorn", (char) 254);
    newEntity("yuml", (char) 255);

    // Special
    newEntity("quot", (char) 34);
    newEntity("apos", (char) 39);
    newEntity("amp", (char) 38);
    newEntity("lt", (char) 60);
    newEntity("gt", (char) 62);
    newEntity("OElig", (char) 338);
    newEntity("oelig", (char) 339);
    newEntity("Scaron", (char) 352);
    newEntity("scaron", (char) 353);
    newEntity("Yuml", (char) 376);
    newEntity("circ", (char) 710);
    newEntity("tilde", (char) 732);
    newEntity("ensp", (char) 8194);
    newEntity("emsp", (char) 8195);
    newEntity("thinsp", (char) 8201);
    newEntity("zwnj", (char) 8204);
    newEntity("zwj", (char) 8205);
    newEntity("lrm", (char) 8206);
    newEntity("rlm", (char) 8207);
    newEntity("ndash", (char) 8211);
    newEntity("mdash", (char) 8212);
    newEntity("lsquo", (char) 8216);
    newEntity("rsquo", (char) 8217);
    newEntity("sbquo", (char) 8218);
    newEntity("ldquo", (char) 8220);
    newEntity("rdquo", (char) 8221);
    newEntity("bdquo", (char) 8222);
    newEntity("dagger", (char) 8224);
    newEntity("Dagger", (char) 8225);
    newEntity("permil", (char) 8240);
    newEntity("lsaquo", (char) 8249);
    newEntity("rsaquo", (char) 8250);
    newEntity("euro", (char) 8364);

    // Symbols
    newEntity("fnof", (char) 402);
    newEntity("Alpha", (char) 913);
    newEntity("Beta", (char) 914);
    newEntity("Gamma", (char) 915);
    newEntity("Delta", (char) 916);
    newEntity("Epsilon", (char) 917);
    newEntity("Zeta", (char) 918);
    newEntity("Eta", (char) 919);
    newEntity("Theta", (char) 920);
    newEntity("Iota", (char) 921);
    newEntity("Kappa", (char) 922);
    newEntity("Lambda", (char) 923);
    newEntity("Mu", (char) 924);
    newEntity("Nu", (char) 925);
    newEntity("Xi", (char) 926);
    newEntity("Omicron", (char) 927);
    newEntity("Pi", (char) 928);
    newEntity("Rho", (char) 929);
    newEntity("Sigma", (char) 931);
    newEntity("Tau", (char) 932);
    newEntity("Upsilon", (char) 933);
    newEntity("Phi", (char) 934);
    newEntity("Chi", (char) 935);
    newEntity("Psi", (char) 936);
    newEntity("Omega", (char) 937);
    newEntity("alpha", (char) 945);
    newEntity("beta", (char) 946);
    newEntity("gamma", (char) 947);
    newEntity("delta", (char) 948);
    newEntity("epsilon", (char) 949);
    newEntity("zeta", (char) 950);
    newEntity("eta", (char) 951);
    newEntity("theta", (char) 952);
    newEntity("iota", (char) 953);
    newEntity("kappa", (char) 954);
    newEntity("lambda", (char) 955);
    newEntity("mu", (char) 956);
    newEntity("nu", (char) 957);
    newEntity("xi", (char) 958);
    newEntity("omicron", (char) 959);
    newEntity("pi", (char) 960);
    newEntity("rho", (char) 961);
    newEntity("sigmaf", (char) 962);
    newEntity("sigma", (char) 963);
    newEntity("tau", (char) 964);
    newEntity("upsilon", (char) 965);
    newEntity("phi", (char) 966);
    newEntity("chi", (char) 967);
    newEntity("psi", (char) 968);
    newEntity("omega", (char) 969);
    newEntity("thetasym", (char) 977);
    newEntity("upsih", (char) 978);
    newEntity("piv", (char) 982);
    newEntity("bull", (char) 8226);
    newEntity("hellip", (char) 8230);
    newEntity("prime", (char) 8242);
    newEntity("Prime", (char) 8243);
    newEntity("oline", (char) 8254);
    newEntity("frasl", (char) 8260);
    newEntity("weierp", (char) 8472);
    newEntity("image", (char) 8465);
    newEntity("real", (char) 8476);
    newEntity("trade", (char) 8482);
    newEntity("alefsym", (char) 8501);
    newEntity("larr", (char) 8592);
    newEntity("uarr", (char) 8593);
    newEntity("rarr", (char) 8594);
    newEntity("darr", (char) 8595);
    newEntity("harr", (char) 8596);
    newEntity("crarr", (char) 8629);
    newEntity("lArr", (char) 8656);
    newEntity("uArr", (char) 8657);
    newEntity("rArr", (char) 8658);
    newEntity("dArr", (char) 8659);
    newEntity("hArr", (char) 8660);
    newEntity("forall", (char) 8704);
    newEntity("part", (char) 8706);
    newEntity("exist", (char) 8707);
    newEntity("empty", (char) 8709);
    newEntity("nabla", (char) 8711);
    newEntity("isin", (char) 8712);
    newEntity("notin", (char) 8713);
    newEntity("ni", (char) 8715);
    newEntity("prod", (char) 8719);
    newEntity("sum", (char) 8721);
    newEntity("minus", (char) 8722);
    newEntity("lowast", (char) 8727);
    newEntity("radic", (char) 8730);
    newEntity("prop", (char) 8733);
    newEntity("infin", (char) 8734);
    newEntity("ang", (char) 8736);
    newEntity("and", (char) 8743);
    newEntity("or", (char) 8744);
    newEntity("cap", (char) 8745);
    newEntity("cup", (char) 8746);
    newEntity("int", (char) 8747);
    newEntity("there4", (char) 8756);
    newEntity("sim", (char) 8764);
    newEntity("cong", (char) 8773);
    newEntity("asymp", (char) 8776);
    newEntity("ne", (char) 8800);
    newEntity("equiv", (char) 8801);
    newEntity("le", (char) 8804);
    newEntity("ge", (char) 8805);
    newEntity("sub", (char) 8834);
    newEntity("sup", (char) 8835);
    newEntity("nsub", (char) 8836);
    newEntity("sube", (char) 8838);
    newEntity("supe", (char) 8839);
    newEntity("oplus", (char) 8853);
    newEntity("otimes", (char) 8855);
    newEntity("perp", (char) 8869);
    newEntity("sdot", (char) 8901);
    newEntity("lceil", (char) 8968);
    newEntity("rceil", (char) 8969);
    newEntity("lfloor", (char) 8970);
    newEntity("rfloor", (char) 8971);
    newEntity("lang", (char) 9001);
    newEntity("rang", (char) 9002);
    newEntity("loz", (char) 9674);
    newEntity("spades", (char) 9824);
    newEntity("clubs", (char) 9827);
    newEntity("hearts", (char) 9829);
    newEntity("diams", (char) 9830);
  }

}
TOP

Related Classes of net.bpiwowar.mg4j.extensions.trec.TRECParsingFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.