Package: org.apache.lucene.document

Examples of org.apache.lucene.document.Document


   */
  @Override
  public void printEndTag(PageRequest request, PageResponse response)
    throws RegainException
  {
    Document hit = (Document) request.getContextAttribute(ATTR_CURRENT_HIT);
    if (hit == null) {
      throw new RegainException("Tag " + getTagName()
          + " must be inside a list tag!");
    }

View Full Code Here


      boolean removeOldEntry = false;

      // Search the entry for this URL
      Term urlTerm = new Term("url", rawDocument.getUrl());
      Query query = new TermQuery(urlTerm);
      Document doc;
      try {
        setIndexMode(SEARCHING_MODE);
        TopScoreDocCollector collector = TopScoreDocCollector.create(20, false);
        mIndexSearcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        if (hits.length > 0) {
          if (hits.length > 1) {
            for (int i = 1; i < hits.length; i++) {
              markForDeletion(mIndexSearcher.doc(hits[i].doc));
            }
            mLog.warn("There are duplicate entries (" + hits.length + " in " +
                    "total) for " + rawDocument.getUrl() + ". They will be removed.");
          }

          doc = mIndexSearcher.doc(hits[0].doc);
        } else {
          doc = null;
        }
      } catch (IOException exc) {
        throw new RegainException("Searching old index entry failed for " + rawDocument.getUrl(), exc);
      }

      // If we found an entry, check whether it is up-to-date
      if (doc != null) {
        // Get the last modification date from the document
        Date docLastModified = rawDocument.getLastModified();

        if (docLastModified == null) {
          // We are not able to get the last modification date from the
          // document (this happens with all http-URLs)
          // -> Delete the old entry and create a new one
          mLog.info("Don't know when the document was last modified. " +
                  "Creating a new index entry...");
          removeOldEntry = true;

        } else {
          // Compare the modification date with the one from the index entry
          String asString = doc.get("last-modified");
          if (asString != null) {
            long diff = 86400001L;
            Date indexLastModified = null;
            try {
              indexLastModified = DateTools.stringToDate(asString);
              diff = docLastModified.getTime() - indexLastModified.getTime();
            } catch (ParseException parseException) {
              mLog.warn("Couldn't parse last-modified date from index. Document: " +
                      rawDocument.getUrl(), parseException);
            }
            if (diff > 86400000L) {
              // -> The index entry is not up-to-date -> Delete the old entry
              mLog.info("Index entry is outdated. Creating a new one (source=" +
                      docLastModified + "), (index=" + indexLastModified + "): " +
                      rawDocument.getUrl());
              removeOldEntry = true;

            } else if ((new Date().getTime()) - indexLastModified.getTime() < 86400000L) {
              // Spidering at the same day
              // Due to the fuzziness of the docLastModified.getTime() (day accuracy)
              // we can't be sure whether the document is up-to-date or not
              mLog.info("Index entry is from the same day. Therefore we have to recrawl but do not index the document." +
                      "Creating a new one (source=" + docLastModified + "), (index=" + indexLastModified + "): " +
                      rawDocument.getUrl());

              parseDocument(rawDocument, errorLogger);

              return;

            } else {
              // The index entry is up-to-date

              // Check whether the preparation failed the last time
              boolean failedLastTime = doc.get("preparation-error") != null;
              if (failedLastTime) {
                if (mRetryFailedDocs) {
                  // The entry failed the last time, the user want's a retry
                  // -> We do a retry
                  mLog.info("Retrying preparation of: " + rawDocument.getUrl());
View Full Code Here

          throws RegainException {
    // Dokument erzeugen
    if (mLog.isDebugEnabled()) {
      mLog.debug("Creating document: " + rawDocument.getUrl());
    }
    Document doc = mDocumentFactory.createDocument(rawDocument, errorLogger);

    // Dokument in den Index aufnehmen
    if (doc != null) {
      mAddToIndexProfiler.startMeasuring();
      try {
View Full Code Here

    setIndexMode(READING_MODE);
    int docCount = mIndexReader.numDocs();
    for (int docIdx = 0; docIdx < docCount; docIdx++) {
      if (!mIndexReader.isDeleted(docIdx)) {
        // Document lesen
        Document doc;
        try {
          doc = mIndexReader.document(docIdx);
        } catch (Throwable thr) {
          throw new RegainException("Getting document #" + docIdx + " from index failed.", thr);
        }

        // URL und last-modified holen
        String url = doc.get("url");
        String lastModified = doc.get("last-modified");

        // Prüfen, ob die URL gelöscht werden soll
        boolean shouldBeDeleted;
        if (url != null) {
          // Prüfen, ob dieser Eintrag zum Löschen vorgesehen ist
View Full Code Here

    }

    rawDocument.setMimeType( mimeType );
   
    // Find the preparator that will prepare this URL
    Document doc = null;
    boolean preparatorFound = false;
    ArrayList <Integer>matchingPreperators = new ArrayList <Integer>();
    for (int i = 0; i < mPreparatorArr.length; i++) {
      if (mPreparatorArr[i].accepts(rawDocument)) {
        // This preparator can prepare this URL
View Full Code Here

      throw new RegainException("Preparator " + preparator.getClass().getName()
        + " did not extract the content of " + url);
    }

    // Preparing succeed -> Create the document
    Document doc = createDocument(rawDocument, cleanedContent, title,
                                  summary, metadata, headlines, path, additionalFieldMap);

    // return the document
    return doc;
  }
View Full Code Here

    throws RegainException
  {
    String url = rawDocument.getUrl();

    // Create a new, empty document
    Document doc = new Document();
   
    // Create the auxiliary fields
    // NOTE: We do this at first, because if someone defined an auxiliary field
    //       having the same name as a normal field, then the field will be
    //       overriden by the normal field. This way we can be sure that the
    //       normal fields have the value we expect.
    AuxiliaryField[] auxiliaryFieldArr = mConfig.getAuxiliaryFieldList();
    if (auxiliaryFieldArr != null) {
      for (int i = 0; i < auxiliaryFieldArr.length; i++) {
        RE regex = auxiliaryFieldArr[i].getUrlRegex();
        if (regex.match(url)) {
          String fieldName = auxiliaryFieldArr[i].getFieldName();

          String value = auxiliaryFieldArr[i].getValue();
          if (value == null) {
            // We have no value set -> Extract the value from the regex
            value = regex.getParen(auxiliaryFieldArr[i].getUrlRegexGroup());
          }

          if (value != null) {
            if (auxiliaryFieldArr[i].getToLowerCase()) {
              value = value.toLowerCase();
            }

            if (mLog.isDebugEnabled()) {
              mLog.debug("Adding auxiliary field: " + fieldName + "=" + value);
            }
            boolean store = auxiliaryFieldArr[i].isStored();
            boolean index = auxiliaryFieldArr[i].isIndexed();
            boolean token = auxiliaryFieldArr[i].isTokenized();

            doc.add(new Field(fieldName, value,
                store ? Field.Store.YES : Field.Store.NO,
                index ? (token ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED) : Field.Index.NO));
          }
        }
      }
    }
   
    // Add the groups of the document
    if (mCrawlerAccessController != null) {
      String[] groupArr = mCrawlerAccessController.getDocumentGroups(rawDocument);
     
      // Check the Group array
      RegainToolkit.checkGroupArray(mCrawlerAccessController, groupArr);

      // Add the field
      // NOTE: The field "groups" is tokenized, but not stemmed.
      //       See: RegainToolkit.WrapperAnalyzer
      Iterator groupIter = Arrays.asList(groupArr).iterator();
      StringBuilder tokenBuilder = new StringBuilder();
      while (groupIter.hasNext()) {
        tokenBuilder.append((String) groupIter.next());
        tokenBuilder.append(" ");
      }
   
      //doc.add(new Field("groups", new IteratorTokenStream(groupIter)));
      doc.add(new Field("groups", new WhitespaceTokenizer(new StringReader(tokenBuilder.toString()))));
    }

    // Add the URL of the document
    doc.add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
   
    // Add the file name (without protocol, drive-letter and path)
    String filenameWithVariants = RegainToolkit.urlToWhitespacedFileName(url);
    doc.add(new Field("filename", new WhitespaceTokenizer(new StringReader(filenameWithVariants))));
    PathFilenamePair pfPair = RegainToolkit.fragmentUrl(url);

    // Add the filename field for sorting
    doc.add(new Field("filename_sort", pfPair.getFilename(), Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Add the document's size
    int size = rawDocument.getLength();
    doc.add(new Field("size", Integer.toString(size), Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Add the mime-type
    String mimeType = rawDocument.getMimeType();
    doc.add(new Field("mimetype", mimeType, Field.Store.YES, Field.Index.NOT_ANALYZED));
   
    // Add last modified
    Date lastModified = rawDocument.getLastModified();
    if (lastModified == null) {
      // We don't know when the document was last modified
      // -> Take the current time
      lastModified = new Date();
    }
    doc.add(new Field("last-modified",
      DateTools.dateToString(lastModified, DateTools.Resolution.DAY), Field.Store.YES,
        Field.Index.NOT_ANALYZED));

    // Write the raw content to an analysis file
    writeContentAnalysisFile(rawDocument);
   
    // Add the additional fields
    if (additionalFieldMap != null) {
      Iterator iter = additionalFieldMap.keySet().iterator();
      while (iter.hasNext()) {
        String fieldName = (String) iter.next();
        String fieldValue = (String) additionalFieldMap.get(fieldName);
        //doc.add(new Field(fieldName, fieldValue, Field.Store.COMPRESS, Field.Index.ANALYZED));
        doc.add(new Field(fieldName, fieldValue, Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new Field(fieldName, CompressionTools.compressString(fieldValue), Field.Store.YES));
      }
    }

    if (hasContent(cleanedContent)) {
      // Write the clean content to an analysis file
      writeAnalysisFile(url, "clean", cleanedContent);

      // Add the cleaned content of the document
      doc.add(new Field("content", cleanedContent,
        this.storeContentForPreview ? Field.Store.YES : Field.Store.NO, Field.Index.ANALYZED));
    } else {
      // We have no content! This is a substitute document
      // -> Add a "preparation-error"-field
      doc.add(new Field("preparation-error", "true", Field.Store.YES,
          Field.Index.NO));
    }

    // Check whether to use the link text as title
    for (int i = 0; i < mUseLinkTextAsTitleReArr.length; i++) {
      if (mUseLinkTextAsTitleReArr[i].match(url)) {
        String linkText = rawDocument.getSourceLinkText();
        if (linkText != null) {
          title = linkText;
        }
        break;
      }
    }

    // Add the document's title
    if (hasContent(title)) {
      doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
      doc.add(new Field("title_sort", title.toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    } else {
      doc.add(new Field("title_sort", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
    }

    // Add the document's summary
    if (! hasContent(summary) && hasContent(cleanedContent)) {
      summary = createSummaryFromContent(cleanedContent);
    }
    if (hasContent(summary)) {
      doc.add(new Field("summary", summary, Field.Store.NO, Field.Index.ANALYZED));
      doc.add(new Field("summary", CompressionTools.compressString(summary), Field.Store.YES));
    }

   // Add the document's metadata
    if (hasContent(metadata)) {
      doc.add(new Field("metadata", metadata, Field.Store.YES, Field.Index.ANALYZED));
    }

    // Add the document's headlines
    if (hasContent(headlines)) {
      doc.add(new Field("headlines", headlines, Field.Store.NO,
          Field.Index.ANALYZED));
    }

    // Add the document's path
    if (pfPair.getPath() != null) {
      //String asString = pathToString(path);
      doc.add(new Field("path", pfPair.getPath(), Field.Store.YES, Field.Index.NO));
      doc.add(new Field("path_sort", pfPair.getPath().toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));

      // Write the path to an analysis file
      writeAnalysisFile(url, "path", pfPair.getPath());
    } else {
      doc.add(new Field("path_sort", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
    }

    return doc;
  }
View Full Code Here

 
  public Document[] createDataTwo(){
    ArrayList<Document> dataList=new ArrayList<Document>();
      String color = "red";
      String ID = Integer.toString(10);
      Document d=new Document();
      d.add(new Field("id",ID,Field.Store.YES,Index.NOT_ANALYZED_NO_NORMS));
      d.add(new Field("color",color,Field.Store.YES,Index.NOT_ANALYZED_NO_NORMS));
      d.add(new NumericField("NUM").setIntValue(10));
      dataList.add(d);
     
       color = "green";
       ID = Integer.toString(11);
       d=new Document();
      d.add(new Field("id",ID,Field.Store.YES,Index.NOT_ANALYZED_NO_NORMS));
      d.add(new Field("color",color,Field.Store.YES,Index.NOT_ANALYZED_NO_NORMS));
      d.add(new NumericField("NUM").setIntValue(11));
      dataList.add(d);
     
   
    return dataList.toArray(new Document[dataList.size()]);
}
View Full Code Here

      ArrayList<Document> dataList=new ArrayList<Document>();
      for(int i=0; i<_documentSize; i++)
      {
        String color = (i%2 == 0) ? "red" : "green";
        String ID = Integer.toString(i);
        Document d=new Document();
        d.add(new Field("id",ID,Field.Store.YES,Index.NOT_ANALYZED_NO_NORMS));
        d.add(new Field("color",color,Field.Store.YES,Index.NOT_ANALYZED_NO_NORMS));
        dataList.add(d);
      }
     
      return dataList.toArray(new Document[dataList.size()]);
  }
View Full Code Here

    throws RegainException
  {
    boolean shouldHighlight = results.getShouldHighlight(hitIndex);
   
    try {
      Document hit = results.getHitDocument(hitIndex);
      if (shouldHighlight) {
        results.highlightHitDocument(hitIndex);
      } else {
        results.shortenSummary(hitIndex);
      }
      request.setContextAttribute(ATTR_CURRENT_HIT, hit);
      float score = results.getHitScore(hitIndex);
      request.setContextAttribute(ATTR_CURRENT_HIT_SCORE, new Float(score));
      request.setContextAttribute(ATTR_CURRENT_HIT_INDEX, new Integer(hitIndex));

      String order = request.getParameter("order");
      //System.out.println("order: " + order);
      if (!(order == null || order.length() == 0 || order.startsWith(SortingOption.RELEVANCE))) {
        String fieldName = order.substring(0, order.lastIndexOf("_"));
        //System.out.println("none standard order. fieldname: " + fieldName);
        Field field = hit.getField(fieldName);
        String fieldContent = null;
        if (field != null) {
          fieldContent = field.stringValue();
        }
        if (fieldContent == null) {
View Full Code Here

TOP

Related Classes of org.apache.lucene.document.Document

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.