Examples of cc.mallet.extract.StringTokenization

cc.mallet.extract.StringTokenization


      spans[i] = span;


    }


    StringTokenization tokenization = new StringTokenization (buf);
    tokenization.addAll (spans);
    carrier.setData (tokenization);


    carrier.setTarget (new LabelsAssignment (new LabelsSequence (lbls)));
    return carrier;
  }

View Full Code Here

    training.addThruPipe (new LineGroupIterator (new StringReader (labelsAtEndData), Pattern.compile ("^$"), true));


    assertEquals (1, training.size ());


    Instance inst1 = training.get (0);
    StringTokenization toks = (StringTokenization) inst1.getData ();
    LabelsSequence ls1 = (LabelsSequence) inst1.getTarget ();


    assertEquals (4, ls1.size ());
    assertEquals (3, toks.get(0).getFeatures ().size ());
    assertEquals ("LBLB LBLD", ls1.getLabels (0).toString ());


    LabelAlphabet globalDict = p.getLabelAlphabet (0);
    assertEquals (2, p.numLevels ());
    assertEquals (globalDict, ls1.getLabels (0).get (0).getLabelAlphabet ());

View Full Code Here

  }


  public Instance pipe (Instance carrier)
  {
    CharSequence string = (CharSequence) carrier.getData();
    StringTokenization dataTokens = new StringTokenization (string);
    TokenSequence targetTokens = new TokenSequence ();
    String tag = backgroundTag;
    String nextTag = backgroundTag;
    Matcher m = sgmlPattern.matcher (string);
    int textStart = 0;
    int textEnd = 0;
    int nextStart = 0;
    boolean done = false;


    logger.fine(sgmlPattern.pattern());
    logger.finer(string.toString());


    while (!done) {
      done = !(m.find());
      if (done)
        textEnd = string.length(); // culotta: changed from string.length()-1 
      else {
        String sgml = m.group();
        logger.finer ("SGML = "+sgml);


        int groupCount = m.groupCount();
        logger.finer(Integer.toString (groupCount));


        if (sgml.charAt(1) == '/')
          nextTag = backgroundTag;
        else{
          //nextTag = m.group(0);
          nextTag = sgml.substring(1, sgml.length()-1);
        }
        logger.finer("nextTag: " + nextTag);


        nextStart = m.end();  // m.end returns one beyond index of last match char
        textEnd = m.start();  // String.subtring does not include index end
        logger.finer ("Text start/end "+textStart+" "+textEnd);
      }
      if (textEnd - textStart > 0) {
        logger.finer ("Tag = "+tag);
        logger.finer ("Target = "+string.subSequence (textStart, textEnd));
        lexer.setCharSequence (string.subSequence (textStart, textEnd));
        while (lexer.hasNext()) {
          lexer.next ();
          int tokStart = textStart + lexer.getStartOffset ();
          int tokEnd = textStart + lexer.getEndOffset ();
          dataTokens.add (new StringSpan (string, tokStart, tokEnd));
          targetTokens.add (new Token (tag));
        }
      }
      textStart = nextStart;
      tag = nextTag;

View Full Code Here

    if (data instanceof Tokenization) {
      // we're done
    } else if (data instanceof TokenSequence) {
      StringBuffer buf = new StringBuffer ();
      TokenSequence ts = (TokenSequence) data;
      StringTokenization spans = new StringTokenization (buf);  // I can use a StringBuffer as the doc! Awesome!


      for (int i = 0; i < ts.size(); i++) {
        Token token = ts.get(i);


        int start = buf.length ();
        buf.append (token.getText());
        int end = buf.length();


        StringSpan span = new StringSpan (buf, start, end);
        span.setFeatures (token.getFeatures ());
        span.setProperties (token.getProperties ());


        spans.add (span);
        buf.append (" ");
      }


      carrier.setData (spans);
    } else {

View Full Code Here

    }




    public Instance pipe(Instance carrier)
    {
      StringTokenization ts =  (StringTokenization) carrier.getData();
      StringTokenization newTs = new StringTokenization((CharSequence) ts.getDocument ());
      final LabelAlphabet dict = (LabelAlphabet) getTargetAlphabet();
      LabelSequence labelSeq = new LabelSequence(dict);
      Label start = dict.lookupLabel ("start");
      Label notstart = dict.lookupLabel ("notstart");


      boolean lastWasSpace = true;
      StringBuffer sb = new StringBuffer();
      for (int i = 0; i < ts.size(); i++) {
        StringSpan t = (StringSpan) ts.getSpan(i);
        if (t.getText().equals(" "))
          lastWasSpace = true;
        else {
          sb.append(t.getText());
          newTs.add(t);
          labelSeq.add(lastWasSpace ? "start" : "notstart");
          lastWasSpace = false;
        }
      }
      if (isTargetProcessing())

View Full Code Here

    Object inputData = carrier.getData();
    LabelAlphabet labels;
    LabelSequence target = null;
    String[][] tokens;
    StringBuffer source = new StringBuffer();
    StringTokenization ts = new StringTokenization(source);
    if (inputData instanceof String)
      tokens = parseSentence((String) inputData);
    else if (inputData instanceof String[][])
      tokens = (String[][]) inputData;
    else
      throw new IllegalArgumentException("Not a String; got " + inputData);
    if (isTargetProcessing()) {
      labels = (LabelAlphabet) getTargetAlphabet();
      target = new LabelSequence(labels, tokens.length);
    }
    for (int l = 0; l < tokens.length; l++) {
      int nFeatures;
      if (isTargetProcessing()) {
        if (tokens[l].length < 1)
          throw new IllegalStateException("Missing label at line "
              + l + " instance " + carrier.getName());
        nFeatures = tokens[l].length - 1;
        target.add(tokens[l][nFeatures]);
      } else
        nFeatures = tokens[l].length;
      int start = source.length();
      String word = makeText(tokens[l]);
      source.append(word + " ");
      Token tok = new StringSpan(source, start, source.length() - 1);
      if (setTokensAsFeatures) {
        for (int f = 0; f < nFeatures; f++)
          tok.setFeatureValue(tokens[l][f], 1.0);
      } else {
        for (int f = 1; f < nFeatures; f++)
          tok.setFeatureValue(tokens[l][f], 1.0);
      }
      ts.add(tok);
    }
    carrier.setData(ts);
    if (isTargetProcessing())
      carrier.setTarget(target);
    return carrier;

View Full Code Here

    LabelSequence target = new LabelSequence ((LabelAlphabet)getTargetAlphabet(), tokens.length);
    boolean [][] ending = new boolean[3][endings.length];
    boolean [][] endingp1 = new boolean[3][endings.length];
    boolean [][] endingp2 = new boolean[3][endings.length];
    StringBuffer source = saveSource ? new StringBuffer() : null;
    TokenSequence data = new StringTokenization (source);


    String prevLabel = "NOLABEL";
    Pattern ipattern = Pattern.compile ("I-.*");
    String word, tag = null, phrase = null, label = null;


    for (int i = 0; i < tokens.length; i++) {
      if (tokens[i].length() != 0) {
        try {
          String[] features = tokens[i].split (" ");
          int fieldIdx = 0;
          word = features[fieldIdx++]; // .toLowerCase();
          if (doTags) tag = features[fieldIdx++];
          if (doPhrases) phrase = features[fieldIdx++];
          if (isTargetProcessing ()) label = features[fieldIdx++];
        } catch (ArrayIndexOutOfBoundsException e) {
          throw new IllegalArgumentException ("Invalid line "+tokens[i]+" : expected word "
            + (doTags ? ", tag" : "")
            + (doPhrases ? ", phrase" : "")
            + (isTargetProcessing () ? ", target" : "")
            + ".");
        }
      } else {
        word = "-<S>-";
        tag = "-<S>-";
        phrase = "-<S>-";
        label = "O";
      }


      // Transformations
      if (doDigitCollapses) {
        if (word.matches ("19\\d\\d"))
          word = "<YEAR>";
        else if (word.matches ("19\\d\\ds"))
          word = "<YEARDECADE>";
        else if (word.matches ("19\\d\\d-\\d+"))
          word = "<YEARSPAN>";
        else if (word.matches ("\\d+\\\\/\\d"))
          word = "<FRACTION>";
        else if (word.matches ("\\d[\\d,\\.]*"))
          word = "<DIGITS>";
        else if (word.matches ("19\\d\\d-\\d\\d-\\d--d"))
          word = "<DATELINEDATE>";
        else if (word.matches ("19\\d\\d-\\d\\d-\\d\\d"))
          word = "<DATELINEDATE>";
        else if (word.matches (".*-led"))
          word = "<LED>";
        else if (word.matches (".*-sponsored"))
          word = "<LED>";
      }


      if (doDowncasing)
        word = word.toLowerCase();


      int start = source.length ();


      if (saveSource) {
        if (word.equals ("-<S>-")) source.append ("\n\n");
        source.append (word); source.append (" ");
      }


      Token token = new StringSpan (source, start, source.length () - 1);


      // Word and tag unigram at current time
      if (doSpelling) {
        for (int j = 0; j < endings.length; j++) {
          ending[2][j] = ending[1][j];
          ending[1][j] = ending[0][j];
          ending[0][j] = endingPatterns[j].matcher(word).matches();
          if (ending[0][j]) token.setFeatureValue (endingNames[0][0][j], 1);
        }
      }


      if (doTags) {
        token.setFeatureValue ("T="+tag, 1);
      }


      if (doPhrases) {
        token.setFeatureValue ("P="+phrase, 1);
      }


      data.add (token);


      if (isTargetProcessing ()) {
        // Change so each segment always begins with a "B-",
        // even if previous token did not have this label.
        String oldLabel = label;

View Full Code Here


  public Instance pipe (Instance carrier)
  {
    CharSequence string = (CharSequence) carrier.getData();
    lexer.setCharSequence (string);
    TokenSequence ts = new StringTokenization (string);
    while (lexer.hasNext()) {
      lexer.next();
      ts.add (new StringSpan (string, lexer.getStartOffset (), lexer.getEndOffset ()));
    }
    carrier.setData(ts);
    return carrier;
  }

View Full Code Here

  }


  public Instance pipe (Instance carrier)
  {
    CharSequence string = (CharSequence) carrier.getData();
    StringTokenization dataTokens = new StringTokenization (string);
    TokenSequence targetTokens = new TokenSequence ();
    String tag = backgroundTag;
    String nextTag = backgroundTag;
    Matcher m = sgmlPattern.matcher (string);
    int textStart = 0;
    int textEnd = 0;
    int nextStart = 0;
    boolean done = false;


    logger.fine(sgmlPattern.pattern());
    logger.finer(string.toString());


    while (!done) {
      done = !(m.find());
      if (done)
        textEnd = string.length(); // culotta: changed from string.length()-1 
      else {
        String sgml = m.group();
        logger.finer ("SGML = "+sgml);


        int groupCount = m.groupCount();
        logger.finer(Integer.toString (groupCount));


        if (sgml.charAt(1) == '/')
          nextTag = backgroundTag;
        else{
          //nextTag = m.group(0);
          nextTag = sgml.substring(1, sgml.length()-1);
        }
        logger.finer("nextTag: " + nextTag);


        nextStart = m.end();  // m.end returns one beyond index of last match char
        textEnd = m.start();  // String.subtring does not include index end
        logger.finer ("Text start/end "+textStart+" "+textEnd);
      }
      if (textEnd - textStart > 0) {
        logger.finer ("Tag = "+tag);
        logger.finer ("Target = "+string.subSequence (textStart, textEnd));
        lexer.setCharSequence (string.subSequence (textStart, textEnd));
        while (lexer.hasNext()) {
          lexer.next ();
          int tokStart = textStart + lexer.getStartOffset ();
          int tokEnd = textStart + lexer.getEndOffset ();
          dataTokens.add (new StringSpan (string, tokStart, tokEnd));
          targetTokens.add (new Token (tag));
        }
      }
      textStart = nextStart;
      tag = nextTag;

View Full Code Here


      spans[i] = span;


    }


    StringTokenization tokenization = new StringTokenization (buf);
    tokenization.addAll (spans);
    carrier.setData (tokenization);


    carrier.setTarget (new LabelsAssignment (new LabelsSequence (lbls)));
    return carrier;
  }

View Full Code Here

TOP

Related Classes of cc.mallet.extract.StringTokenization

cc.mallet.extract.pipe.TokenSequence2Tokenization

cc.mallet.fst.tests.TestMEMM$TestMEMMTokenSequenceRemoveSpaces

cc.mallet.grmm.learning.GenericAcrfData2TokenSequence

cc.mallet.grmm.test.TestGenericAcrfData2TokenSequence

cc.mallet.pipe.CharSequence2TokenSequence

cc.mallet.pipe.SGML2TokenSequence

cc.mallet.pipe.SimpleTaggerSentence2StringTokenization

cc.mallet.share.casutton.ner.ConllNer2003Sentence2TokenSequence

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.