Package edu.stanford.nlp.process

Examples of edu.stanford.nlp.process.CoreLabelTokenFactory


  }

  // TODO replace with GrammaticalStructure#readCoNLLGrammaticalStructureCollection
  public static void loadConllFile(String inFile, List<CoreMap> sents, List<DependencyTree> trees, boolean labeled)
  {
    CoreLabelTokenFactory tf = new CoreLabelTokenFactory(false);

    BufferedReader reader = null;
    try {
      reader = IOUtils.readerFromString(inFile);

      CoreMap sentence = new CoreLabel();
      List<CoreLabel> sentenceTokens = new ArrayList<>();

      DependencyTree tree = new DependencyTree();

      for (String line : IOUtils.getLineIterable(reader, false)) {
        String[] splits = line.split("\t");
        if (splits.length < 10) {
          trees.add(tree);
          sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
          sents.add(sentence);

          tree = new DependencyTree();
          sentence = new CoreLabel();
          sentenceTokens = new ArrayList<>();
        } else {
          String word = splits[1],
                  pos = splits[4],
                  depType = splits[7];
          int head = Integer.parseInt(splits[6]);

          CoreLabel token = tf.makeToken(word, 0, 0);
          token.setTag(pos);
          token.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, head);
          token.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, depType);
          sentenceTokens.add(token);
View Full Code Here


    System.out.println();

    // This option shows loading and using an explicit tokenizer
    String sent2 = "This is another sentence.";
    TokenizerFactory<CoreLabel> tokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tok =
        tokenizerFactory.getTokenizer(new StringReader(sent2));
    List<CoreLabel> rawWords2 = tok.tokenize();
    parse = lp.apply(rawWords2);
View Full Code Here

        this.tokenFactory = (CoreTokenFactory<IN>) Class.forName(flags.tokenFactory).newInstance();
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    } else {
      this.tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
    }

    init(flags, this.tokenFactory, this.tokensAnnotationClassName);
  }
View Full Code Here

        this.tokenFactory = (CoreTokenFactory<IN>) Class.forName(tokenFactoryClassName).newInstance();
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    } else {
      this.tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
    }
    init(mapString, this.tokenFactory, this.tokensAnnotationClassName);
  }
View Full Code Here

    }
    init(mapString, this.tokenFactory, this.tokensAnnotationClassName);
  }

  public void init(String map) {
    init(map, (CoreTokenFactory<IN>) new CoreLabelTokenFactory(),
        "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation");
  }
View Full Code Here

    protected boolean splitCompoundOption = false;
    protected boolean splitVerbOption = false;
    protected boolean splitContractionOption = false;

    public static TokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory() {
      return new SpanishTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), ANCORA_OPTS);
    }
View Full Code Here

            createInstance(flags.featureFactoriesArgs.get(i));
        this.featureFactories.add(indFeatureFactory);
      }
    }
    if (flags.tokenFactory == null) {
      tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
    } else {
      this.tokenFactory = new MetaClass(flags.tokenFactory).createInstance(flags.tokenFactoryArgs);
    //   this.tokenFactory = (CoreTokenFactory<IN>) Class.forName(flags.tokenFactory).newInstance();
    }
    // } catch (Exception e) {
View Full Code Here

    if (args.length != 2) {
      System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
      return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
                     "untokenizable=noneKeep");
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
View Full Code Here

      }
    }
   
    switch(type) {
    case Spanish:
      factory = SpanishTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;
     
    case French:
      factory = FrenchTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;
     
    case Whitespace:
      boolean eolIsSignificant = Boolean.valueOf(props.getProperty(EOL_PROPERTY, "false"));
      eolIsSignificant = eolIsSignificant || Boolean.valueOf(props.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false"));
      factory = new WhitespaceTokenizer.WhitespaceTokenizerFactory<CoreLabel> (new CoreLabelTokenFactory(), eolIsSignificant);
      break;
     
    case English:
    case German:
      factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;
     
    case Unspecified:
      System.err.println("TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.");
      factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;
     
    default:
      throw new IllegalArgumentException("No valid tokenizer type provided.\n" +
                                         "Use -tokenize.language, -tokenize.class, or -tokenize.whitespace \n" +
View Full Code Here

   *
   * @return A tokenizer
   */
  @Override
  public TokenizerFactory<? extends HasWord> getTokenizerFactory() {
    return SpanishTokenizer.factory(new CoreLabelTokenFactory(),
        "invertible,ptb3Escaping=true,splitAll=true");
  }
View Full Code Here

TOP

Related Classes of edu.stanford.nlp.process.CoreLabelTokenFactory

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.