Package: org.apache.nutch.analysis

Examples of org.apache.nutch.analysis.AnalyzerFactory


    perm = new Path(FileOutputFormat.getOutputPath(job), name);
    temp = job.getLocalPath("index/_"  +
                      Integer.toString(new Random().nextInt()));

    fs.delete(perm, true); // delete old, if any
    analyzerFactory = new AnalyzerFactory(job);
    writer = new IndexWriter(
        FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
        new NutchDocumentAnalyzer(job), true, MaxFieldLength.UNLIMITED);

    writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
View Full Code Here


      final Path temp = job.getLocalPath("index/_"
        + Integer.toString(new Random().nextInt()));

      fs.delete(perm, true); // delete old, if any

      final AnalyzerFactory factory = new AnalyzerFactory(job);
      final IndexWriter writer = // build locally first
      new IndexWriter(
        FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
        new NutchDocumentAnalyzer(job), true,
        new MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));

      writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
      writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
      writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs",
        Integer.MAX_VALUE));
      writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
      writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
      writer.setInfoStream(LogUtil.getInfoStream(LOG));
      writer.setUseCompoundFile(false);
      writer.setSimilarity(new NutchSimilarity());

      return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() {
        boolean closed;

        public void write(WritableComparable key, LuceneDocumentWrapper value)
          throws IOException { // unwrap & index doc
          Document doc = value.get();
          NutchAnalyzer analyzer = factory.get(doc.get("lang"));
          if (LOG.isInfoEnabled()) {
            LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]"
              + " with analyzer " + analyzer);
          }
          writer.addDocument(doc, analyzer);
View Full Code Here

      final Path temp = job.getLocalPath("index/_"
        + Integer.toString(new Random().nextInt()));

      fs.delete(perm, true); // delete old, if any

      final AnalyzerFactory factory = new AnalyzerFactory(job);
      final IndexWriter writer = // build locally first
      new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
        new NutchDocumentAnalyzer(job), true);

      writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
      writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
      writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs",
        Integer.MAX_VALUE));
      writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
      writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
      writer.setInfoStream(LogUtil.getInfoStream(LOG));
      writer.setUseCompoundFile(false);
      writer.setSimilarity(new NutchSimilarity());

      return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() {
        boolean closed;

        public void write(WritableComparable key, LuceneDocumentWrapper value)
          throws IOException { // unwrap & index doc
          Document doc = value.get();
          NutchAnalyzer analyzer = factory.get(doc.get("lang"));
          if (LOG.isInfoEnabled()) {
            LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]"
              + " with analyzer " + analyzer);
          }
          writer.addDocument(doc, analyzer);
View Full Code Here

    perm = new Path(FileOutputFormat.getOutputPath(job), name);
    temp = job.getLocalPath("index/_"  +
                      Integer.toString(new Random().nextInt()));

    fs.delete(perm, true); // delete old, if any
    analyzerFactory = new AnalyzerFactory(job);
    writer = new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
        new NutchDocumentAnalyzer(job), true);

    writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
    writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
View Full Code Here

      final Path temp = job.getLocalPath("index/_"
        + Integer.toString(new Random().nextInt()));

      fs.delete(perm, true); // delete old, if any

      final AnalyzerFactory factory = new AnalyzerFactory(job);
      final IndexWriter writer = // build locally first
      new IndexWriter(
        FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
        new NutchDocumentAnalyzer(job), true,
        new MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));

      writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
      writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
      writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs",
        Integer.MAX_VALUE));
      writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
      writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
      writer.setInfoStream(LogUtil.getInfoStream(LOG));
      writer.setUseCompoundFile(false);
      writer.setSimilarity(new NutchSimilarity());

      return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() {
        boolean closed;

        public void write(WritableComparable key, LuceneDocumentWrapper value)
          throws IOException { // unwrap & index doc
          Document doc = value.get();
          NutchAnalyzer analyzer = factory.get(doc.get("lang"));
          if (LOG.isInfoEnabled()) {
            LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]"
              + " with analyzer " + analyzer);
          }
          writer.addDocument(doc, analyzer);
View Full Code Here

    perm = new Path(FileOutputFormat.getOutputPath(job), name);
    temp = job.getLocalPath("index/_"  +
                      Integer.toString(new Random().nextInt()));

    fs.delete(perm, true); // delete old, if any
    analyzerFactory = new AnalyzerFactory(job);
    writer = new IndexWriter(
        FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
        new NutchDocumentAnalyzer(job), true, MaxFieldLength.UNLIMITED);

    writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
View Full Code Here

TOP

Related Classes of org.apache.nutch.analysis.AnalyzerFactory

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.