Examples of RawDocument


Examples of com.dawidweiss.carrot.core.local.clustering.RawDocument

      List rawDocuments = this.rawCluster.getDocuments();
      documents = new HitDetails[ rawDocuments.size() ];
     
      int j = 0;
      for (Iterator i = rawDocuments.iterator(); i.hasNext(); j++) {
        RawDocument doc = (RawDocument) i.next();
        Integer offset = (Integer) doc.getId();
        documents[j] = this.hits[offset.intValue()];
      }
    }

    return documents;
View Full Code Here

Examples of com.dawidweiss.carrot.core.local.clustering.RawDocument

      List rawDocuments = this.rawCluster.getDocuments();
      documents = new HitDetails[ rawDocuments.size() ];
     
      int j = 0;
      for (Iterator i = rawDocuments.iterator(); i.hasNext(); j++) {
        RawDocument doc = (RawDocument) i.next();
        Integer offset = (Integer) doc.getId();
        documents[j] = this.hits[offset.intValue()];
      }
    }

    return documents;
View Full Code Here

Examples of com.dawidweiss.carrot.core.local.clustering.RawDocument

      List rawDocuments = this.rawCluster.getDocuments();
      documents = new HitDetails[ rawDocuments.size() ];
     
      int j = 0;
      for (Iterator i = rawDocuments.iterator(); i.hasNext(); j++) {
        RawDocument doc = (RawDocument) i.next();
        Integer offset = (Integer) doc.getId();
        documents[j] = this.hits[offset.intValue()];
      }
    }

    return documents;
View Full Code Here

Examples of com.dawidweiss.carrot.core.local.clustering.RawDocument

      List rawDocuments = this.rawCluster.getDocuments();
      documents = new HitDetails[ rawDocuments.size() ];
     
      int j = 0;
      for (Iterator i = rawDocuments.iterator(); i.hasNext(); j++) {
        RawDocument doc = (RawDocument) i.next();
        Integer offset = (Integer) doc.getId();
        documents[j] = this.hits[offset.intValue()];
      }
    }

    return documents;
View Full Code Here

Examples of net.sf.okapi.common.resource.RawDocument

        List<TextFlow> resources = document.getTextFlows();
        Map<String, HasContents> addedResources =
                new HashMap<String, HasContents>();

        RawDocument rawDoc =
                new RawDocument(documentContent, "UTF-8",
                        net.sf.okapi.common.LocaleId.fromString("en"));
        updateParams(filterParams);
        try {
            filter.open(rawDoc);
            String subDocName = "";
View Full Code Here

Examples of net.sf.okapi.common.resource.RawDocument

        if (localeId == null || localeId.isEmpty()) {
            throw new IllegalArgumentException(
                    "locale id string cannot be null or empty");
        }

        RawDocument rawDoc =
                new RawDocument(fileUri, "UTF-8",
                        net.sf.okapi.common.LocaleId.fromString("en"));
        return parseTranslationFile(rawDoc, filterParams);
    }
View Full Code Here

Examples of net.sf.okapi.common.resource.RawDocument

    private void generateTranslatedFile(URI originalFile,
            Map<String, TextFlowTarget> translations,
            net.sf.okapi.common.LocaleId localeId, IFilterWriter writer,
            Optional<String> params) {
        RawDocument rawDoc =
                new RawDocument(originalFile, "UTF-8",
                        net.sf.okapi.common.LocaleId.fromString("en"));
        updateParams(params);
        try {
            filter.open(rawDoc);
            String subDocName = "";
View Full Code Here

Examples of net.sf.regain.crawler.document.RawDocument

    for (int i = 0; i < docFileArr.length; i++) {
      if (docFileArr[i].isFile()) {
        String url = RegainToolkit.fileToUrl(docFileArr[i]);
        mLog.info("Preparing document: " + url);
        try {
          RawDocument doc = new RawDocument(url, sourceUrl, null, null);

          profiler.startMeasuring();
          String content;
          try {
            prep.prepare(doc);
View Full Code Here

Examples of net.sf.regain.crawler.document.RawDocument

          continue;
        }
      }

      // Create a raw document
      RawDocument rawDocument;
      try {
        rawDocument = new RawDocument(url, mCurrentJob.getSourceUrl(),
          mCurrentJob.getSourceLinkText(),
          CrawlerToolkit.findAuthenticationValuesForURL(url, accountPasswordStore));
       
      } catch (RedirectException exc) {
        String redirectUrl = exc.getRedirectUrl();
        mLog.info("Redirect '" + url +  "' -> '" + redirectUrl + "'");
        mUrlChecker.setIgnored(url);
        // the RedirectURL inherit the properties for shouldBeParsed, shouldBeIndexed from the
        // sourceURL. This is possibly not right according to definitions in the whitelist
        addJob(redirectUrl, mCurrentJob.getSourceUrl(), shouldBeParsed,
               shouldBeIndexed, mCurrentJob.getSourceLinkText());
        mCrawlerJobProfiler.stopMeasuring(0);
        continue;
      }
      catch (RegainException exc) {
        // Check whether the exception was caused by a dead link
        handleDocumentLoadingException(exc, mCurrentJob);

        // This document does not exist -> We can't parse or index anything
        // -> continue
        mCrawlerJobProfiler.abortMeasuring();
        continue;
      }

      if( shouldBeIndexed || shouldBeParsed ){
        if (mLog.isDebugEnabled()) {
          mLog.debug("Parsing and indexing " + rawDocument.getUrl());
        }
        mHtmlParsingProfiler.startMeasuring();

        // Parse and index content and metadata
        if (shouldBeIndexed) {
           try {
            mIndexWriterManager.addToIndex(rawDocument, this);
          }
          catch (RegainException exc) {
            logError("Indexing failed for: " + rawDocument.getUrl(), exc, false);
          }
        }

        // Extract links form the document (parse=true). The real meaning of parse in this context
        // is link-extraction. The document is parsed anyway (building a html-node tree).
        if (shouldBeParsed) {
          if(!shouldBeIndexed){
            // The document is not parsed so parse it
            mIndexWriterManager.getDocumentFactory().createDocument(rawDocument, this);
          }
          try {
            //parseHtmlDocument(rawDocument);
            createCrawlerJobs(rawDocument);
          }
          catch (RegainException exc) {
            logError("CrawlerJob creation failed for: " + rawDocument.getUrl(), exc, false);
          }
        }
        mHtmlParsingProfiler.stopMeasuring(rawDocument.getLength());
      }
      // System-Ressourcen des RawDocument wieder frei geben.
      rawDocument.dispose();

      // Zeitmessung stoppen
      mCrawlerJobProfiler.stopMeasuring(rawDocument.getLength());
      mCurrentJob = null;
     
      // Check whether to create a breakpoint
      int breakpointInterval = mConfiguration.getBreakpointInterval();
      boolean breakpointIntervalIsOver = (breakpointInterval > 0)
View Full Code Here

Examples of org.carrot2.core.clustering.RawDocument

      List rawDocuments = this.rawCluster.getDocuments();
      documents = new HitDetails[ rawDocuments.size() ];
     
      int j = 0;
      for (Iterator i = rawDocuments.iterator(); i.hasNext(); j++) {
        RawDocument doc = (RawDocument) i.next();
        Integer offset = (Integer) doc.getId();
        documents[j] = this.hits[offset.intValue()];
      }
    }

    return documents;
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.