Examples of ITextExtractor


Examples of com.ikanow.infinit.e.data_model.interfaces.harvest.ITextExtractor

        for (String customText: customTextArray) {
          if (!customExtractors.containsKey(customText)) {
            // (else already have this extractor)
            try {
              Class customTextExtractor = customExtractorClassLoader.loadClass(customText);
              ITextExtractor obj = (ITextExtractor)customTextExtractor.newInstance();
              text_extractor_mappings.put(obj.getName().toLowerCase(), obj);
              customExtractors.put(customText, customTextExtractor);
            }
            catch (Exception e) {
              logger.error("ITextExtractor: Couldn't load " + customText +": " + e.getMessage(), e);
            }
            catch(NoClassDefFoundError e) {
              logger.error("ITextExtractor: Couldn't load " + customText +": " + e.getMessage(), e);
            }       
          }       
          else { // Already loaded, put in again
            try {
              Class customTextExtractor = customExtractors.get(customText)
              ITextExtractor obj = (ITextExtractor)customTextExtractor.newInstance();
              text_extractor_mappings.put(obj.getName().toLowerCase(), obj);           
            }
            catch (Exception e) {
              logger.error("ITextExtractor: Couldn't use already loaded " + customText +": " + e.getMessage(), e);
            }
            catch(NoClassDefFoundError e) {
              logger.error("ITextExtractor: Couldn't use already loaded " + customText +": " + e.getMessage(), e);
            }       
          }
        }
      }//TESTED
      // Entity extractors
      String customEntityList = props.getCustomEntityExtractors();
      if (null != customEntityList) {
        String customEntityArray[] = customEntityList.split("\\s*,\\s*");
        for (String customEntity: customEntityArray) {
          if (!customExtractors.containsKey(customEntity)) {
            // (else already have this extractor - but may have it for text, so some work to do)
            try {
              Class customEntityExtractor = customExtractorClassLoader.loadClass(customEntity);
              IEntityExtractor obj = (IEntityExtractor)customEntityExtractor.newInstance();
              entity_extractor_mappings.put(obj.getName().toLowerCase(), obj);
              customExtractors.put(customEntity, customEntityExtractor);
            }
            catch (Exception e) {
              logger.error("IEntityExtractor: Couldn't load " + customEntity +": " + e.getMessage(), e);
            }
            catch(NoClassDefFoundError e) {
              logger.error("IEntityExtractor: Couldn't load " + customEntity +": " + e.getMessage(), e);
            }       
          }
          else { // If this object exists and if it's a text extractor, then see if it's also an entity extractor
            try {
              Class customEntityExtractor = customExtractors.get(customEntity);           
              IEntityExtractor obj = (IEntityExtractor)customEntityExtractor.newInstance();
              entity_extractor_mappings.put(obj.getName(), obj);
            }
            catch (Exception e) {
              logger.error("IEntityExtractor: Couldn't use already loaded " + customEntity +": " + e.getMessage(), e);           
            }
            catch(NoClassDefFoundError e) {
View Full Code Here

Examples of com.ikanow.infinit.e.data_model.interfaces.harvest.ITextExtractor

      // A teeny bit of complex logic:
      // toAdd by default use a text extractor
      // DB/Files by default don't (but can override)

      ITextExtractor currentTextExtractor = null;
      boolean bUseRawContentWhereAvailable = false; // (only applies for feeds)
      if (null != source.useTextExtractor()) {
        currentTextExtractor = text_extractor_mappings.get(source.useTextExtractor().toLowerCase());
        if (null == currentTextExtractor) { // (second chance)
          currentTextExtractor = (ITextExtractor) lookForDynamicExtractor(source, true);
        }
      }
      if (null == currentTextExtractor) { // none specified or didn't find it (<-latter is error)       
        if (null != source.useTextExtractor()) {                   

          if ((null == source.getStructuredAnalysisConfig()) && (null == source.getUnstructuredAnalysisConfig())
              && (null == source.getProcessingPipeline()))
          {
            //(UAH and SAH get raw access to the data if they need it, so can carry on - ditto processing pipeline)

            StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()).append(" no_txt_extractor=").append(source.useTextExtractor());
            logger.warn(errMsg.toString());

            // No point trying this for the rest of the day
            throw new ExtractorSourceLevelException(errMsg.toString());
          }
          else {
            bUseRawContentWhereAvailable = true; // (only checked for feeds)           
          }//TESTED
        }
        else if (source.getExtractType().equalsIgnoreCase("feed")) // (DB/files just use their existing fullText)
        {
          if (null != currentEntityExtractor) {
            String selfExtraction = currentEntityExtractor.getCapability(EntityExtractorEnum.URLTextExtraction);
            // Leave as null unless have no built-in capability
            if ((null == selfExtraction) || !selfExtraction.equals("true"))
            {
              currentTextExtractor = default_text_extractor;
            }
          }
          else {
            currentTextExtractor = default_text_extractor;           
          }
        }//TESTED   
      }

      // EXTRACTION
      Iterator<DocumentPojo> i = toAdd.iterator(); //iterator created so that elements in the toAdd list can be
      // removed within the loop
      while ( i.hasNext() )
      {
        long nTime_ms = System.currentTimeMillis();
        DocumentPojo doc = i.next();
        boolean bExtractedText = false;

        // If I've been stopped then just remove all remaining documents
        // (pick them up next time through)
        if (bIsKilled) {
          i.remove();
          if (!calledFromPipeline) {
            doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
          }
          continue;
        }

        if ( calledFromPipeline || !urlsThatError.contains(doc.getUrl()) ) //only attempt if url is okay
        {       
          feed_count++;

          try {
            // (Check for truncation)
            if ((null != currentEntityExtractor) && (null != doc.getFullText())) {
              try {
                String s = currentEntityExtractor.getCapability(EntityExtractorEnum.MaxInputBytes);
                if (null != s) {
                  int maxLength = Integer.parseInt(s);
                  if (doc.getFullText().length() > maxLength) { //just warn, it's up to the extractor to sort it out
                    getHarvestStatus().logMessage("Warning: truncating document to max length: " + s, false);
                  }
                }
              }
              catch (Exception e) {} // max length not reported just carry on
            }
           
            if (null != currentTextExtractor)
            { 
              bExtractedText = true;
              currentTextExtractor.extractText(doc);
              if (null != currentEntityExtractor) {
                currentEntityExtractor.extractEntities(doc);
              }

            }//TESTED
View Full Code Here

Examples of com.ikanow.infinit.e.data_model.interfaces.harvest.ITextExtractor

          classToLoad = Class.forName(extractorInfo.getTitle(), true, child);
          dynamicExtractorClassCache.put(extractorInfo.getTitle(), classToLoad);
        }

        if (bTextExtractor) {
          ITextExtractor txtExtractor = (ITextExtractor )classToLoad.newInstance();
          text_extractor_mappings.put(source.useTextExtractor(), txtExtractor);
          outClassInstance = txtExtractor;
        }
        else {
          IEntityExtractor entExtractor = (IEntityExtractor)classToLoad.newInstance();
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.