Examples of LineFileDocs

org.apache.lucene.util.LineFileDocs
Minimal port of benchmark's LneDocSource + DocMaker, so tests can enum docs from a line file created by benchmark's WriteLineDoc task

Examples of org.apache.lucene.util.LineFileDocs

  
  // Build FST for all unique terms in the test line docs
  // file, up until a time limit
  public void testRealTerms() throws Exception {


    final LineFileDocs docs = new LineFileDocs(random(), defaultCodecSupportsDocValues());
    final int RUN_TIME_MSEC = atLeast(500);
    final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
    final File tempDir = _TestUtil.getTempDir("fstlines");
    final Directory dir = newFSDirectory(tempDir);
    final IndexWriter writer = new IndexWriter(dir, conf);
    final long stopTime = System.currentTimeMillis() + RUN_TIME_MSEC;
    Document doc;
    int docCount = 0;
    while((doc = docs.nextDoc()) != null && System.currentTimeMillis() < stopTime) {
      writer.addDocument(doc);
      docCount++;
    }
    IndexReader r = DirectoryReader.open(writer, true);
    writer.close();

View Full Code Here

Examples of org.apache.lucene.util.LineFileDocs

    }
  }


  @Ignore
  public void testWiki() throws Exception {
    final LineFileDocs lfd = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt", false);
    // Skip header:
    lfd.nextDoc();
    FreeTextSuggester sug = new FreeTextSuggester(new MockAnalyzer(random()));
    sug.build(new InputIterator() {


        private int count;


        @Override
        public long weight() {
          return 1;
        }


        @Override
        public Comparator<BytesRef> getComparator() {
          return null;
        }


        @Override
        public BytesRef next() {
          Document doc;
          try {
            doc = lfd.nextDoc();
          } catch (IOException ioe) {
            throw new RuntimeException(ioe);
          }
          if (doc == null) {
            return null;

View Full Code Here

Examples of org.apache.lucene.util.LineFileDocs

      assertEquals(new BytesRef("for all the fish"), results.get(2).payload);
    }
  }
  
  public void testRandomRealisticKeys() throws IOException {
    LineFileDocs lineFile = new LineFileDocs(random());
    Map<String, Long> mapping = new HashMap<>();
    List<Input> keys = new ArrayList<>();
    
    int howMany = atLeast(100); // this might bring up duplicates
    for (int i = 0; i < howMany; i++) {
      Document nextDoc = lineFile.nextDoc();
      String title = nextDoc.getField("title").stringValue();
      int randomWeight = random().nextInt(100);
      keys.add(new Input(title, randomWeight));
      if (!mapping.containsKey(title) || mapping.get(title) < randomWeight) {
          mapping.put(title, Long.valueOf(randomWeight));
      }
    }
    AnalyzingSuggester analyzingSuggester = new AnalyzingSuggester(new MockAnalyzer(random()), new MockAnalyzer(random()),
        AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, random().nextBoolean());
    boolean doPayloads = random().nextBoolean();
    if (doPayloads) {
      List<Input> keysAndPayloads = new ArrayList<>();
      for (Input termFreq : keys) {
        keysAndPayloads.add(new Input(termFreq.term, termFreq.v, new BytesRef(Long.toString(termFreq.v))));
      }
      analyzingSuggester.build(new InputArrayIterator(keysAndPayloads));
    } else {
      analyzingSuggester.build(new InputArrayIterator(keys));  
    }
    
    for (Input termFreq : keys) {
      List<LookupResult> lookup = analyzingSuggester.lookup(termFreq.term.utf8ToString(), false, keys.size());
      for (LookupResult lookupResult : lookup) {
        assertEquals(mapping.get(lookupResult.key), Long.valueOf(lookupResult.value));
        if (doPayloads) {
          assertEquals(lookupResult.payload.utf8ToString(), Long.toString(lookupResult.value));
        } else {
          assertNull(lookupResult.payload);
        }
      }
    }
    
    lineFile.close();
  }

View Full Code Here

Examples of org.apache.lucene.util.LineFileDocs

    NRTCachingDirectory cachedDir = new NRTCachingDirectory(dir, 2.0, 25.0);
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
    RandomIndexWriter w = new RandomIndexWriter(random(), cachedDir, conf);
    final LineFileDocs docs = new LineFileDocs(random(),
                                               defaultCodecSupportsDocValues());
    final int numDocs = TestUtil.nextInt(random(), 100, 400);


    if (VERBOSE) {
      System.out.println("TEST: numDocs=" + numDocs);
    }


    final List<BytesRef> ids = new ArrayList<>();
    DirectoryReader r = null;
    for(int docCount=0;docCount<numDocs;docCount++) {
      final Document doc = docs.nextDoc();
      ids.add(new BytesRef(doc.get("docid")));
      w.addDocument(doc);
      if (random().nextInt(20) == 17) {
        if (r == null) {
          r = DirectoryReader.open(w.w, false);
        } else {
          final DirectoryReader r2 = DirectoryReader.openIfChanged(r);
          if (r2 != null) {
            r.close();
            r = r2;
          }
        }
        assertEquals(1+docCount, r.numDocs());
        final IndexSearcher s = newSearcher(r);
        // Just make sure search can run; we can't assert
        // totHits since it could be 0
        TopDocs hits = s.search(new TermQuery(new Term("body", "the")), 10);
        // System.out.println("tot hits " + hits.totalHits);
      }
    }


    if (r != null) {
      r.close();
    }


    // Close should force cache to clear since all files are sync'd
    w.close();


    final String[] cachedFiles = cachedDir.listCachedFiles();
    for(String file : cachedFiles) {
      System.out.println("FAIL: cached file " + file + " remains after sync");
    }
    assertEquals(0, cachedFiles.length);
    
    r = DirectoryReader.open(dir);
    for(BytesRef id : ids) {
      assertEquals(1, r.docFreq(new Term("docid", id)));
    }
    r.close();
    cachedDir.close();
    docs.close();
  }

View Full Code Here

Examples of org.apache.lucene.util.LineFileDocs

  
  // Build FST for all unique terms in the test line docs
  // file, up until a time limit
  public void testRealTerms() throws Exception {


    final LineFileDocs docs = new LineFileDocs(random(), defaultCodecSupportsDocValues());
    final int RUN_TIME_MSEC = atLeast(500);
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));


    final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
    final File tempDir = createTempDir("fstlines");
    final Directory dir = newFSDirectory(tempDir);
    final IndexWriter writer = new IndexWriter(dir, conf);
    final long stopTime = System.currentTimeMillis() + RUN_TIME_MSEC;
    Document doc;
    int docCount = 0;
    while((doc = docs.nextDoc()) != null && System.currentTimeMillis() < stopTime) {
      writer.addDocument(doc);
      docCount++;
    }
    IndexReader r = DirectoryReader.open(writer, true);
    writer.close();

View Full Code Here

Examples of org.apache.lucene.util.LineFileDocs

    packCount.set(0);


    final long t0 = System.currentTimeMillis();


    Random random = new Random(random().nextLong());
    final LineFileDocs docs = new LineFileDocs(random, defaultCodecSupportsDocValues());
    final File tempDir = createTempDir(testName);
    dir = getDirectory(newMockFSDirectory(tempDir)); // some subclasses rely on this being MDW
    if (dir instanceof BaseDirectoryWrapper) {
      ((BaseDirectoryWrapper) dir).setCheckIndexOnClose(false); // don't double-checkIndex, we do it ourselves.
    }
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
    conf.setInfoStream(new FailOnNonBulkMergesInfoStream());
    if (conf.getMergePolicy() instanceof MockRandomMergePolicy) {
      ((MockRandomMergePolicy)conf.getMergePolicy()).setDoNonBulkMerges(false);
    }


    if (LuceneTestCase.TEST_NIGHTLY) {
      // newIWConfig makes smallish max seg size, which
      // results in tons and tons of segments for this test
      // when run nightly:
      MergePolicy mp = conf.getMergePolicy();
      if (mp instanceof TieredMergePolicy) {
        ((TieredMergePolicy) mp).setMaxMergedSegmentMB(5000.);
      } else if (mp instanceof LogByteSizeMergePolicy) {
        ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1000.);
      } else if (mp instanceof LogMergePolicy) {
        ((LogMergePolicy) mp).setMaxMergeDocs(100000);
      }
    }


    conf.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() {
      @Override
      public void warm(AtomicReader reader) throws IOException {
        if (VERBOSE) {
          System.out.println("TEST: now warm merged reader=" + reader);
        }
        warmed.put(((SegmentReader) reader).core, Boolean.TRUE);
        final int maxDoc = reader.maxDoc();
        final Bits liveDocs = reader.getLiveDocs();
        int sum = 0;
        final int inc = Math.max(1, maxDoc/50);
        for(int docID=0;docID<maxDoc;docID += inc) {
          if (liveDocs == null || liveDocs.get(docID)) {
            final Document doc = reader.document(docID);
            sum += doc.getFields().size();
          }
        }


        IndexSearcher searcher = newSearcher(reader);
        sum += searcher.search(new TermQuery(new Term("body", "united")), 10).totalHits;


        if (VERBOSE) {
          System.out.println("TEST: warm visited " + sum + " fields");
        }
      }
      });


    if (VERBOSE) {
      conf.setInfoStream(new PrintStreamInfoStream(System.out) {
          @Override
          public void message(String component, String message) {
            if ("TP".equals(component)) {
              return; // ignore test points!
            }
            super.message(component, message);
          }
        });
    }
    writer = new IndexWriter(dir, conf);
    TestUtil.reduceOpenFiles(writer);


    final ExecutorService es = random().nextBoolean() ? null : Executors.newCachedThreadPool(new NamedThreadFactory(testName));


    doAfterWriter(es);


    final int NUM_INDEX_THREADS = TestUtil.nextInt(random(), 2, 4);


    final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 300 : RANDOM_MULTIPLIER;


    final Set<String> delIDs = Collections.synchronizedSet(new HashSet<String>());
    final Set<String> delPackIDs = Collections.synchronizedSet(new HashSet<String>());
    final List<SubDocs> allSubDocs = Collections.synchronizedList(new ArrayList<SubDocs>());


    final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC*1000;


    final Thread[] indexThreads = launchIndexingThreads(docs, NUM_INDEX_THREADS, stopTime, delIDs, delPackIDs, allSubDocs);


    if (VERBOSE) {
      System.out.println("TEST: DONE start " + NUM_INDEX_THREADS + " indexing threads [" + (System.currentTimeMillis()-t0) + " ms]");
    }


    // Let index build up a bit
    Thread.sleep(100);


    doSearching(es, stopTime);


    if (VERBOSE) {
      System.out.println("TEST: all searching done [" + (System.currentTimeMillis()-t0) + " ms]");
    }
    
    for(int thread=0;thread<indexThreads.length;thread++) {
      indexThreads[thread].join();
    }


    if (VERBOSE) {
      System.out.println("TEST: done join indexing threads [" + (System.currentTimeMillis()-t0) + " ms]; addCount=" + addCount + " delCount=" + delCount);
    }


    final IndexSearcher s = getFinalSearcher();
    if (VERBOSE) {
      System.out.println("TEST: finalSearcher=" + s);
    }


    assertFalse(failed.get());


    boolean doFail = false;


    // Verify: make sure delIDs are in fact deleted:
    for(String id : delIDs) {
      final TopDocs hits = s.search(new TermQuery(new Term("docid", id)), 1);
      if (hits.totalHits != 0) {
        System.out.println("doc id=" + id + " is supposed to be deleted, but got " + hits.totalHits + " hits; first docID=" + hits.scoreDocs[0].doc);
        doFail = true;
      }
    }


    // Verify: make sure delPackIDs are in fact deleted:
    for(String id : delPackIDs) {
      final TopDocs hits = s.search(new TermQuery(new Term("packID", id)), 1);
      if (hits.totalHits != 0) {
        System.out.println("packID=" + id + " is supposed to be deleted, but got " + hits.totalHits + " matches");
        doFail = true;
      }
    }


    // Verify: make sure each group of sub-docs are still in docID order:
    for(SubDocs subDocs : allSubDocs) {
      TopDocs hits = s.search(new TermQuery(new Term("packID", subDocs.packID)), 20);
      if (!subDocs.deleted) {
        // We sort by relevance but the scores should be identical so sort falls back to by docID:
        if (hits.totalHits != subDocs.subIDs.size()) {
          System.out.println("packID=" + subDocs.packID + ": expected " + subDocs.subIDs.size() + " hits but got " + hits.totalHits);
          doFail = true;
        } else {
          int lastDocID = -1;
          int startDocID = -1;
          for(ScoreDoc scoreDoc : hits.scoreDocs) {
            final int docID = scoreDoc.doc;
            if (lastDocID != -1) {
              assertEquals(1+lastDocID, docID);
            } else {
              startDocID = docID;
            }
            lastDocID = docID;
            final Document doc = s.doc(docID);
            assertEquals(subDocs.packID, doc.get("packID"));
          }


          lastDocID = startDocID - 1;
          for(String subID : subDocs.subIDs) {
            hits = s.search(new TermQuery(new Term("docid", subID)), 1);
            assertEquals(1, hits.totalHits);
            final int docID = hits.scoreDocs[0].doc;
            if (lastDocID != -1) {
              assertEquals(1+lastDocID, docID);
            }
            lastDocID = docID;
          }
        }
      } else {
        // Pack was deleted -- make sure its docs are
        // deleted.  We can't verify packID is deleted
        // because we can re-use packID for update:
        for(String subID : subDocs.subIDs) {
          assertEquals(0, s.search(new TermQuery(new Term("docid", subID)), 1).totalHits);
        }
      }
    }


    // Verify: make sure all not-deleted docs are in fact
    // not deleted:
    final int endID = Integer.parseInt(docs.nextDoc().get("docid"));
    docs.close();


    for(int id=0;id<endID;id++) {
      String stringID = ""+id;
      if (!delIDs.contains(stringID)) {
        final TopDocs hits = s.search(new TermQuery(new Term("docid", stringID)), 1);

View Full Code Here

Examples of org.apache.lucene.util.LineFileDocs

    }
  }


  private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) throws IOException {


    final LineFileDocs docs = new LineFileDocs(random);
    Document doc = null;
    Field field = null, currentField = null;
    StringReader bogus = new StringReader("");
    if (iw != null) {
      doc = new Document();
      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
      if (random.nextBoolean()) {
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(random.nextBoolean());
        ft.setStoreTermVectorPositions(random.nextBoolean());
        if (ft.storeTermVectorPositions() && !OLD_FORMAT_IMPERSONATION_IS_ACTIVE) {
          ft.setStoreTermVectorPayloads(random.nextBoolean());
        }
      }
      if (random.nextBoolean()) {
        ft.setOmitNorms(true);
      }
      String pf = TestUtil.getPostingsFormat("dummy");
      boolean supportsOffsets = !doesntSupportOffsets.contains(pf);
      switch(random.nextInt(4)) {
        case 0: ft.setIndexOptions(IndexOptions.DOCS_ONLY); break;
        case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break;
        case 2: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break;
        default:
                if (supportsOffsets && offsetsAreCorrect) {
                  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
                } else {
                  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
                }
      }
      currentField = field = new Field("dummy", bogus, ft);
      doc.add(currentField);
    }
    
    try {
      for (int i = 0; i < iterations; i++) {
        String text;
        
        if (random.nextInt(10) == 7) {
          // real data from linedocs
          text = docs.nextDoc().get("body");
          if (text.length() > maxWordLength) {
            
            // Take a random slice from the text...:
            int startPos = random.nextInt(text.length() - maxWordLength);
            if (startPos > 0 && Character.isLowSurrogate(text.charAt(startPos))) {

View Full Code Here

Examples of org.apache.lucene.util.LineFileDocs

    assertNull(reader.termPositionsEnum(new Term("not-in-index", "foo")));
    assertNull(reader.terms("not-in-index"));
  }
  
  public void testDuellMemIndex() throws IOException {
    LineFileDocs lineFileDocs = new LineFileDocs(random());
    int numDocs = atLeast(10);
    MemoryIndex memory = new MemoryIndex(random().nextBoolean(),  random().nextInt(50) * 1024 * 1024);
    for (int i = 0; i < numDocs; i++) {
      Directory dir = newDirectory();
      MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
      mockAnalyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
      IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(random(), TEST_VERSION_CURRENT, mockAnalyzer));
      Document nextDoc = lineFileDocs.nextDoc();
      Document doc = new Document();
      for (IndexableField field : nextDoc.getFields()) {
        if (field.fieldType().indexed()) {
          doc.add(field);
          if (random().nextInt(3) == 0) {
            doc.add(field);  // randomly add the same field twice
          }
        }
      }
      
      writer.addDocument(doc);
      writer.close();
      for (IndexableField field : doc.getFields()) {
          memory.addField(field.name(), ((Field)field).stringValue(), mockAnalyzer);  
      }
      DirectoryReader competitor = DirectoryReader.open(dir);
      AtomicReader memIndexReader= (AtomicReader) memory.createSearcher().getIndexReader();
      duellReaders(competitor, memIndexReader);
      IOUtils.close(competitor, memIndexReader);
      memory.reset();
      dir.close();
    }
    lineFileDocs.close();
  }

View Full Code Here

Examples of org.apache.lucene.util.LineFileDocs

    final AtomicReference<IndexWriter> writerRef = new AtomicReference<>();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));


    writerRef.set(new IndexWriter(d, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer)));
    final LineFileDocs docs = new LineFileDocs(random());
    final Thread[] threads = new Thread[threadCount];
    final int iters = atLeast(100);
    final AtomicBoolean failed = new AtomicBoolean();
    final Lock rollbackLock = new ReentrantLock();
    final Lock commitLock = new ReentrantLock();
    for(int threadID=0;threadID<threadCount;threadID++) {
      threads[threadID] = new Thread() {
          @Override
          public void run() {
            for(int iter=0;iter<iters && !failed.get();iter++) {
              //final int x = random().nextInt(5);
              final int x = random().nextInt(3);
              try {
                switch(x) {
                case 0:
                  rollbackLock.lock();
                  if (VERBOSE) {
                    System.out.println("\nTEST: " + Thread.currentThread().getName() + ": now rollback");
                  }
                  try {
                    writerRef.get().rollback();
                    if (VERBOSE) {
                      System.out.println("TEST: " + Thread.currentThread().getName() + ": rollback done; now open new writer");
                    }
                    writerRef.set(new IndexWriter(d, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))));
                  } finally {
                    rollbackLock.unlock();
                  }
                  break;
                case 1:
                  commitLock.lock();
                  if (VERBOSE) {
                    System.out.println("\nTEST: " + Thread.currentThread().getName() + ": now commit");
                  }
                  try {
                    if (random().nextBoolean()) {
                      writerRef.get().prepareCommit();
                    }
                    writerRef.get().commit();
                  } catch (AlreadyClosedException ace) {
                    // ok
                  } catch (NullPointerException npe) {
                    // ok
                  } finally {
                    commitLock.unlock();
                  }
                  break;
                case 2:
                  if (VERBOSE) {
                    System.out.println("\nTEST: " + Thread.currentThread().getName() + ": now add");
                  }
                  try {
                    writerRef.get().addDocument(docs.nextDoc());
                  } catch (AlreadyClosedException ace) {
                    // ok
                  } catch (NullPointerException npe) {
                    // ok
                  } catch (AssertionError ae) {

View Full Code Here

Examples of org.apache.lucene.util.LineFileDocs

   * populates a writer with random stuff. this must be fully reproducable with the seed!
   */
  public static void createRandomIndex(int numdocs, RandomIndexWriter writer, long seed) throws IOException {
    Random random = new Random(seed);
    // primary source for our data is from linefiledocs, its realistic.
    LineFileDocs lineFileDocs = new LineFileDocs(random);


    // TODO: we should add other fields that use things like docs&freqs but omit positions,
    // because linefiledocs doesn't cover all the possibilities.
    for (int i = 0; i < numdocs; i++) {
      Document document = lineFileDocs.nextDoc();
      // grab the title and add some SortedSet instances for fun
      String title = document.get("titleTokenized");
      String split[] = title.split("\\s+");
      for (String trash : split) {
        document.add(new SortedSetDocValuesField("sortedset", new BytesRef(trash)));
      }
      // add a numeric dv field sometimes
      document.removeFields("sparsenumeric");
      if (random.nextInt(4) == 2) {
        document.add(new NumericDocValuesField("sparsenumeric", random.nextInt()));
      }
      // add sortednumeric sometimes
      document.removeFields("sparsesortednum");
      if (random.nextInt(5) == 1) {
        document.add(new SortedNumericDocValuesField("sparsesortednum", random.nextLong()));
        if (random.nextBoolean()) {
          document.add(new SortedNumericDocValuesField("sparsesortednum", random.nextLong()));
        }
      }
      writer.addDocument(document);
    }
    
    lineFileDocs.close();
  }

View Full Code Here

0 1 2 3 4 5

TOP

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.