Package types

Examples of types.Alphabet
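
A minimal sketch of the pattern shared by the excerpts below: two types.Alphabet instances are created, one for feature names (xAlphabet) and one for class labels (yAlphabet), handed to a data reader, and queried with size() after the data is read. Only the no-argument constructor and size() are taken from the excerpts; the class name AlphabetSketch, the commented-out reader call, and the label-count printout are illustrative, not part of the library's documented API.

import types.Alphabet;

public class AlphabetSketch {

  public static void main(String[] args) {
    // one alphabet for feature names, one for class labels -- the pair
    // every excerpt below constructs before reading its data
    Alphabet xAlphabet = new Alphabet();
    Alphabet yAlphabet = new Alphabet();

    // a reader such as NewsgroupsReader or InternetAdReader takes both
    // alphabets and fills them in while reading, e.g.:
    //   ArrayList<ClassificationInstance> allData =
    //       (new NewsgroupsReader(xAlphabet, yAlphabet)).readFile(args[0]);

    // afterwards each alphabet reports how many distinct entries it holds
    System.out.println("num Features = " + xAlphabet.size());
    System.out.println("num Labels   = " + yAlphabet.size());
  }
}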


public class InternetAds {

  public static void main(String[] args) throws IOException {
    // read in the data.
    ArrayList<ClassificationInstance> allData = (new InternetAdReader(
        new Alphabet(), new Alphabet())).readFile(args[0]);
    StaticUtils.shuffle(allData, 0);
    // randomly split the data into training and testing parts
    ArrayList<ClassificationInstance>[] tmp = StaticUtils.split(allData,
        200);
    ArrayList<ClassificationInstance> train = tmp[0];
    ArrayList<ClassificationInstance> test = tmp[1];
    Alphabet xA = allData.get(0).xAlphabet;
    Alphabet yA = allData.get(0).yAlphabet;
    System.out.println("num Features = " + allData.get(0).xAlphabet.size());
    LinearClassifier h;
    h = trainAdaBoost(50, train, xA, yA);
    System.out.println("Boost  Train Accuracy = "
        + StaticUtils.computeAccuracy(h, train));
View Full Code Here


    }
  }

  public static void main(String[] args) {
    ArrayList<ClassificationInstance> train = new ArrayList<ClassificationInstance>();
    Alphabet xAlphabet = new Alphabet();
    Alphabet yAlphabet = new Alphabet();
    String[] classes = new String[] { "a", "b" };
    Random r = new Random(10);
    int numFeats = 5;
    double randomFrac = 0.5;
    double missingFrac = 0.5;
View Full Code Here

public class PartOfSpeech {

  public static void main(String[] args) throws IOException {
    // read in the data.
    ArrayList<SequenceInstance> allData = (new PartOfSpeechReader(
        new Alphabet(), new Alphabet())).readFile(args[0]);
    StaticUtils.shuffle(allData, 0);
    // randomly split the data into training and testing parts
    ArrayList<SequenceInstance>[] tmp = StaticUtils.splitS(allData, 150);
    ArrayList<SequenceInstance> train = tmp[0];
    ArrayList<SequenceInstance> test = tmp[1];
    Alphabet xA = allData.get(0).xAlphabet;
    Alphabet yA = allData.get(0).yAlphabet;
    System.out.println("num Features = " + allData.get(0).xAlphabet.size());
    LinearTagger h;
    h = PartOfSpeech.trainCRF(train, xA, yA);
    System.out.println("CRF    Train Accuracy = "
        + StaticUtils.computeAccuracyS(h, train));
View Full Code Here

public class Newsgroups {

  public static void main(String[] args) throws IOException {
    // read in the data.
    ArrayList<ClassificationInstance> allData = (new NewsgroupsReader(
        new Alphabet(), new Alphabet())).readFile(args[0]);
    StaticUtils.shuffle(allData, 0);
    // randomly split the data into training and testing parts
    ArrayList<ClassificationInstance>[] tmp = StaticUtils.split(allData, 10);
    System.out.println(allData.get(0).xAlphabet.size());
    ArrayList<ClassificationInstance> train = tmp[0];
    ArrayList<ClassificationInstance> test = tmp[1];
    Alphabet xA = allData.get(0).xAlphabet;
    Alphabet yA = allData.get(0).yAlphabet;
    System.out.println("num Features = " + allData.get(0).xAlphabet.size());
    LinearClassifier h;
    h = trainMaxEnt(train, xA, yA);
    // print out accuracy
    System.out.println("MaxEnt Train Accuracy = "
View Full Code Here

      ResourceInstantiationException {

    // extract the document content as a string
    String text = document.getContent().toString();
    ArrayList<Element<Object>> elements = new ArrayList<Element<Object>>();
    Alphabet labelAlphabet = tagger.getYAlphabet();
    FeatureMap features = Factory.newFeatureMap();
    long start = 0;
    boolean wasSpace = true;
    ElementSequence<Element<Object>> sequence;
    SparseVector[] x;
    Object[] y;
    char[] chars;
    int[] labels;
    String chunk, label;
    long end, newStart, newEnd;
    int id;

    for (int i = 0; i < text.length(); i++) {

      if ((text.charAt(i) == ' ') || (text.charAt(i) == '\n')) {

        // if the previous character wasn't a space but this one is,
        // update the end offset and make a temporary token annotation
        if (!wasSpace) {

          end = i;
          id = outputAS.add(start, end, "TempToken", features);
          chars = document.getContent().getContent(start, end)
              .toString().toCharArray();

          for (Character c : chars) {

            // the chunk for a single character is the character itself
            chunk = c.toString();
            label = "?";

            // create an element with the chunk and label
            elements.add(new Element<Object>(chunk, label, outputAS
                .get(id)));
          }

          // create the x and y arrays
          x = new SparseVector[elements.size()];
          y = new Object[elements.size()];

          // fill in the labels
          for (int j = 0; j < y.length; j++)
            y[j] = "?";

          // create an element sequence, extract its features and
          // label it
          sequence = new ElementSequence<Element<Object>>(elements,
              xAlphabet, yAlphabet, x, y, document, inputAS);
          labels = tagger.label(extractor.process(sequence).x);
          elements = new ArrayList<Element<Object>>();

          if (labels.length == chars.length) {

            // update the new start offset
            newStart = start;

            for (int j = 1; j < chars.length - 1; j++) {

              // if a middle character is labeled "B",
              // update the new end offset, create a token
              // annotation
              // and update the new start offset
              if (labels[j] == labelAlphabet.lookupObject("B")) {

                newEnd = start + j;
                outputAS.add(newStart, newEnd, labelType,
                    features);
                newStart = newEnd;
View Full Code Here

    // load the pipeline and extractor
    pipeline = (ConditionalSerialAnalyserController)
    PersistenceManager.loadObjectFromUrl(path2xgapp);
    extractor = new GroovyPreprocessor(path2rules);
    sequences = new ArrayList<SequenceInstance>();
    xAlphabet = new Alphabet();
    yAlphabet = new Alphabet();
    this.path2xgapp = path2xgapp;
  }
View Full Code Here

            + "used to create the model");
    }
    extractor = (GroovyPreprocessor) in.readObject();
    tagger = (LinearTagger) in.readObject();
    in.close();
    xAlphabet = new Alphabet();
    yAlphabet = new Alphabet();
  }
View Full Code Here

    Factory.deleteResource(pipeline);

    // extract sentences from text
    AnnotationSet sentences = inputAS.get(sequenceType);
    ArrayList<Element<Object>> elements = new ArrayList<Element<Object>>();
    Alphabet labelAlphabet = tagger.getYAlphabet();
    OffsetComparator oc = new OffsetComparator();
    SparseVector[] x;
    Object[] y;
    int[] labels;
    ElementSequence<Element<Object>> sequence;
    Element<Object> element;
    ArrayList<Annotation> tokens;
    Iterator<Annotation> iterator;
    String chunk, label;

    for (Annotation sentence : sentences) {

      // extract tokens from sentence
      tokens = new ArrayList<Annotation>(inputAS.get(
          sentence.getStartNode().getOffset(),
          sentence.getEndNode().getOffset()).get(elementType));
      Collections.sort(tokens, oc);

      for (Annotation token : tokens) {

        // extract the chunk of the token
        chunk = document.getContent().getContent(
            token.getStartNode().getOffset(),
            token.getEndNode().getOffset()).toString();
        label = "?";

        // create an element with the chunk and label
        element = new Element<Object>(chunk, label, token);
        elements.add(element);
      }

      x = new SparseVector[elements.size()];
      y = new Object[elements.size()];

      // fill in the labels
      for (int i = 0; i < y.length; i++)
        y[i] = "?";

      // create an element sequence and extract the features
      sequence = new ElementSequence<Element<Object>>(elements, xAlphabet,
          yAlphabet, x, y, document, inputAS);
      elements = new ArrayList<Element<Object>>();

      // extract the labels for the tokens in the sentence
      labels = tagger.label(extractor.process(sequence).x);
      iterator = tokens.iterator();

      for (int i = 0; i < labels.length; i++) {

        // add the labels as new annotation features
        if (iterator.hasNext())
          iterator.next().getFeatures().put(labelType,
              labelAlphabet.lookupIndex(labels[i]));

        // shouldn't happen, but who knows?
        else
          System.err.println("Unexpected number of tokens in sentence!");
View Full Code Here

    Factory.deleteResource(pipeline);
   
    // extract sentences from text
    AnnotationSet sentences = inputAS.get(sequenceType);
    ArrayList<Element<Object>> elements = new ArrayList<Element<Object>>();
    Alphabet labelAlphabet = tagger.getYAlphabet();
    OffsetComparator oc = new OffsetComparator();
    SparseVector[] x;
    Object[] y;
    int[] labels;
    ElementSequence<Element<Object>> sequence;
    Element<Object> element;
    ArrayList<Annotation> tokens;
    String chunk, label;
   
    for (Annotation sentence : sentences) {
     
      // extract tokens from sentence
      tokens = new ArrayList<Annotation>(inputAS.get(sentence.getStartNode().getOffset(),
          sentence.getEndNode().getOffset()).get(elementType));
      Collections.sort(tokens, oc);
     
      for (Annotation token : tokens) {

        // extract the chunk of the token
        chunk = document.getContent().getContent(token.getStartNode().getOffset(),
            token.getEndNode().getOffset()).toString();
        label = "?";

        // create an element with the chunk and label
        element = new Element<Object>(chunk, label, token);
        elements.add(element);
      }
     
      x = new SparseVector[elements.size()];
      y = new Object[elements.size()];

      // fill in the labels
      for (int i = 0; i < y.length; i++)
        y[i] = "?";

      // create an element sequence and extract the features
      sequence = new ElementSequence<Element<Object>>(elements, xAlphabet,
          yAlphabet, x, y, document, inputAS);
      elements = new ArrayList<Element<Object>>();

      // extract the labels for the tokens in the sentence
      labels = tagger.label(extractor.process(sequence).x);
      FeatureMap features = Factory.newFeatureMap();
      long start, end;
     
      if (labels.length == tokens.size()) {
       
        for (int i = 0; i < tokens.size(); i++) {
         
          if (labels[i] == labelAlphabet.lookupObject("B-NP")) {
            start = tokens.get(i).getStartNode().getOffset();
            end = tokens.get(i).getEndNode().getOffset();
           
            while ((i + 1 < tokens.size()) &&
                (labels[i + 1] == labelAlphabet.lookupObject("I-NP"))) {
              end = tokens.get(i + 1).getEndNode().getOffset();
              i++;
            }
           
            outputAS.add(start, end, nChunkType, features);
          }
         
          else if (labels[i] == labelAlphabet.lookupObject("B-VP")) {
            start = tokens.get(i).getStartNode().getOffset();
            end = tokens.get(i).getEndNode().getOffset();
           
            while ((i + 1 < tokens.size()) &&
                (labels[i + 1] == labelAlphabet.lookupObject("I-VP"))) {
              end = tokens.get(i + 1).getEndNode().getOffset();
              i++;
            }
           
            outputAS.add(start, end, vChunkType, features);
View Full Code Here

    Factory.deleteResource(pipeline);

    // extract sentences from text
    AnnotationSet sentences = inputAS.get(sequenceType);
    ArrayList<Element<Object>> elements = new ArrayList<Element<Object>>();
    Alphabet labelAlphabet = tagger.getYAlphabet();
    OffsetComparator oc = new OffsetComparator();
    SparseVector[] x;
    Object[] y;
    int[] labels;
    ElementSequence<Element<Object>> sequence;
    Element<Object> element;
    ArrayList<Annotation> tokens;
    String chunk, label;

    for (Annotation sentence : sentences) {

      // extract tokens from sentence
      tokens = new ArrayList<Annotation>(inputAS.get(sentence.getStartNode().getOffset(),
          sentence.getEndNode().getOffset()).get(elementType));
      Collections.sort(tokens, oc);

      for (Annotation token : tokens) {

        // extract the chunk of the token
        chunk = document.getContent().getContent(token.getStartNode().getOffset(),
            token.getEndNode().getOffset()).toString();
        label = "?";

        // create an element with the chunk and label
        element = new Element<Object>(chunk, label, token);
        elements.add(element);
      }

      x = new SparseVector[elements.size()];
      y = new Object[elements.size()];

      // fill in the labels
      for (int i = 0; i < y.length; i++)
        y[i] = "?";

      // create an element sequence and extract the features
      sequence = new ElementSequence<Element<Object>>(elements, xAlphabet,
          yAlphabet, x, y, document, inputAS);
      elements = new ArrayList<Element<Object>>();

      // extract the labels for the tokens in the sentence
      labels = tagger.label(extractor.process(sequence).x);
      FeatureMap features = Factory.newFeatureMap();
      long start, end;

      if (labels.length == tokens.size()) {

        for (int i = 0; i < tokens.size(); i++) {

          if (labels[i] == labelAlphabet.lookupObject("B-DAT")) {
            start = tokens.get(i).getStartNode().getOffset();
            end = tokens.get(i).getEndNode().getOffset();

            while ((i + 1 < tokens.size()) &&
                (labels[i + 1] == labelAlphabet.lookupObject("I-DAT"))) {
              end = tokens.get(i + 1).getEndNode().getOffset();
              i++;
            }

            outputAS.add(start, end, datType, features);
          }

          else if (labels[i] == labelAlphabet.lookupObject("B-LOC")) {
            start = tokens.get(i).getStartNode().getOffset();
            end = tokens.get(i).getEndNode().getOffset();

            while ((i + 1 < tokens.size()) &&
                (labels[i + 1] == labelAlphabet.lookupObject("I-LOC"))) {
              end = tokens.get(i + 1).getEndNode().getOffset();
              i++;
            }

            outputAS.add(start, end, locType, features);
          }

          else if (labels[i] == labelAlphabet.lookupObject("B-ORG")) {
            start = tokens.get(i).getStartNode().getOffset();
            end = tokens.get(i).getEndNode().getOffset();

            while ((i + 1 < tokens.size()) &&
                (labels[i + 1] == labelAlphabet.lookupObject("I-ORG"))) {
              end = tokens.get(i + 1).getEndNode().getOffset();
              i++;
            }

            outputAS.add(start, end, orgType, features);
          }

          else if (labels[i] == labelAlphabet.lookupObject("B-PER")) {
            start = tokens.get(i).getStartNode().getOffset();
            end = tokens.get(i).getEndNode().getOffset();

            while ((i + 1 < tokens.size()) &&
                (labels[i + 1] == labelAlphabet.lookupObject("I-PER"))) {
              end = tokens.get(i + 1).getEndNode().getOffset();
              i++;
            }

            outputAS.add(start, end, perType, features);
View Full Code Here
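
The last two excerpts repeat the same B-*/I-* span-merging loop once per chunk or entity type (B-NP/I-NP, B-VP/I-VP, B-DAT/I-DAT, and so on). Below is a hedged sketch of how that loop could be factored into a single helper; the class and method names (BioSpanUtil, addSpans) are hypothetical, and it assumes, as the excerpts themselves do when comparing against labels[i], that labelAlphabet.lookupObject(tag) yields the integer id of a label.

import java.util.List;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.util.InvalidOffsetException;

import types.Alphabet;

public class BioSpanUtil {

  // merge each B-<tag> token and its following I-<tag> tokens into one
  // annotation of the given type in outputAS
  public static void addSpans(List<Annotation> tokens, int[] labels,
      String tag, String annotationType, AnnotationSet outputAS,
      Alphabet labelAlphabet) throws InvalidOffsetException {

    FeatureMap features = Factory.newFeatureMap();

    for (int i = 0; i < tokens.size(); i++) {

      // a "B-" label opens a new span
      if (labels[i] == labelAlphabet.lookupObject("B-" + tag)) {
        long start = tokens.get(i).getStartNode().getOffset();
        long end = tokens.get(i).getEndNode().getOffset();

        // absorb the following "I-" tokens into the same span
        while ((i + 1 < tokens.size())
            && (labels[i + 1] == labelAlphabet.lookupObject("I-" + tag))) {
          end = tokens.get(i + 1).getEndNode().getOffset();
          i++;
        }

        outputAS.add(start, end, annotationType, features);
      }
    }
  }
}

With such a helper, each per-type block in the excerpts above would reduce to a single call such as addSpans(tokens, labels, "DAT", datType, outputAS, labelAlphabet).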
