Package com.ikanow.infinit.e.data_model.store.config.source

Examples of com.ikanow.infinit.e.data_model.store.config.source.StructuredAnalysisConfigPojo


  // Scratch configuration reused by the code below: it is created on first use (while still null),
  // loaded with the entity or association specs supplied by the caller, and then handed to
  // expandIterationLoops(...) before the entities/associations are extracted from it.
  StructuredAnalysisConfigPojo _pipelineTmpConfig = null;
 
  public void setEntities(DocumentPojo doc, List<EntitySpecPojo> entSpecs) throws JSONException, ScriptException {
    intializeDocIfNeeded(doc, _gson);
    if (null == _pipelineTmpConfig) {
      _pipelineTmpConfig = new StructuredAnalysisConfigPojo();
    }
    _pipelineTmpConfig.setEntities(entSpecs);
    expandIterationLoops(_pipelineTmpConfig);
    List<EntityPojo> ents = getEntities(_pipelineTmpConfig.getEntities(), doc);
    if (null == doc.getEntities()) { // (else has already been added by getEntities)
View Full Code Here


   
    //TODO (INF-1922): Allow setting of directed sentiment (here and in legacy code)
   
    intializeDocIfNeeded(doc, _gson);
    if (null == _pipelineTmpConfig) {
      _pipelineTmpConfig = new StructuredAnalysisConfigPojo();
    }
    _pipelineTmpConfig.setAssociations(assocSpecs);
    expandIterationLoops(_pipelineTmpConfig);
    List<AssociationPojo> assocs = getAssociations(_pipelineTmpConfig.getAssociations(), doc);
    if (null == doc.getAssociations()) { // (else has already been added by getAssociations)
View Full Code Here

    Gson g = _gson;
   
    // Skip if the StructuredAnalysis object of the source is null
    if (source.getStructuredAnalysisConfig() != null)
    {
      StructuredAnalysisConfigPojo s = source.getStructuredAnalysisConfig();     
      // (some pre-processing to expand the specs)
      expandIterationLoops(s)
     
      // Instantiate a new ScriptEngineManager and create an engine to execute
      // the type of script specified in StructuredAnalysisConfigPojo.scriptEngine
      this.intializeScriptEngine();     
           
      this.loadLookupCaches(s.getCaches(), source.getCommunityIds());
     
      // Iterate over each doc in docs, create entity and association pojo objects
      // to add to the feed using the source entity and association spec pojos
      Iterator<DocumentPojo> it = docs.iterator();
      int nDocs = 0;
      while (it.hasNext())
      {
        DocumentPojo f = it.next();
        nDocs++;
        try
        {
          resetEntityCache();
          _document = null;
          _docPojo = null;
            // (don't create this until needed, since it might need to be (re)serialized after a call
            //  to the UAH which would obviously be undesirable)
                                 
          // If the script engine has been instantiated pass the feed document and any scripts
          if (_scriptEngine != null)
          {
            List<String> scriptList = null;
            List<String> scriptFileList = null;
            try {
              // Script code embedded in source
              scriptList = Arrays.asList(s.getScript());
            }
            catch (Exception e) {}
            try {
              // scriptFiles - can contain String[] of script files to import into the engine
              scriptFileList = Arrays.asList(s.getScriptFiles());             
            }
            catch (Exception e) {}             
            this.loadGlobalFunctions(scriptFileList, scriptList, s.getScriptEngine());           
          }//TESTED
         
      // 1. Document level fields
         
          // Extract Title if applicable
          boolean bTryTitleLater = false;
          try {
            if (s.getTitle() != null)
            {
              intializeDocIfNeeded(f, g);
              if (JavaScriptUtils.containsScript(s.getTitle()))
              {
                f.setTitle((String)getValueFromScript(s.getTitle(), null, null));
              }
              else
              {
                f.setTitle(getFormattedTextFromField(s.getTitle(), null));
              }
              if (null == f.getTitle()) {
                bTryTitleLater = true;
              }
            }
          }
          catch (Exception e)
          {
            this._context.getHarvestStatus().logMessage("title: " + e.getMessage(), true);           
            //DEBUG (don't output log messages per doc)
            //logger.error("title: " + e.getMessage(), e);
          }

          // Extract Display URL if applicable
          boolean bTryDisplayUrlLater = false;
          try {
            if (s.getDisplayUrl() != null)
            {
              intializeDocIfNeeded(f, g);
              if (JavaScriptUtils.containsScript(s.getDisplayUrl()))
              {
                f.setDisplayUrl((String)getValueFromScript(s.getDisplayUrl(), null, null));
              }
              else
              {
                f.setDisplayUrl(getFormattedTextFromField(s.getDisplayUrl(), null));
              }
              if (null == f.getDisplayUrl()) {
                bTryDisplayUrlLater = true;
              }
            }
          }
          catch (Exception e)
          {
            this._context.getHarvestStatus().logMessage("displayUrl: " + e.getMessage(), true);           
            //DEBUG (don't output log messages per doc)
            //logger.error("displayUrl: " + e.getMessage(), e);
          }
          //TOTEST

          // Extract Description if applicable
          boolean bTryDescriptionLater = false;
          try {
            if (s.getDescription() != null)
            {
              intializeDocIfNeeded(f, g);
              if (JavaScriptUtils.containsScript(s.getDescription()))
              {
                f.setDescription((String)getValueFromScript(s.getDescription(), null, null));
              }
              else
              {
                f.setDescription(getFormattedTextFromField(s.getDescription(), null));
              }
              if (null == f.getDescription()) {
                bTryDescriptionLater = true;
              }
            }
          }
          catch (Exception e)
          {
            this._context.getHarvestStatus().logMessage("description: " + e.getMessage(), true);           
            //DEBUG (don't output log messages per doc)
            //logger.error("description: " + e.getMessage(), e);
          }
         

          // Extract fullText if applicable
          boolean bTryFullTextLater = false;
          try {
            if (s.getFullText() != null)
            {
              intializeDocIfNeeded(f, g);
              if (JavaScriptUtils.containsScript(s.getFullText()))
              {
                f.setFullText((String)getValueFromScript(s.getFullText(), null, null));
              }
              else
              {
                f.setFullText(getFormattedTextFromField(s.getFullText(), null));
              }
              if (null == f.getFullText()) {
                bTryFullTextLater = true;
              }
            }
          }
          catch (Exception e)
          {
            this._context.getHarvestStatus().logMessage("fullText: " + e.getMessage(), true);           
            //DEBUG (don't output log messages per doc)
            //logger.error("fullText: " + e.getMessage(), e);
          }
 
          // Published date is done after the UAH
          // (since the UAH can't access it, and it might be populated via the UAH)
         
      // 2. UAH/extraction properties
         
          // Add fields to metadata that can be used to create entities and associations
          // (Either with the UAH, or with the entity extractor)
          try {
            boolean bMetadataChanged = false;
            if (null != this._unstructuredHandler)
            {
              try
              {
                this._unstructuredHandler.set_sahEngine(_scriptEngine);
                bMetadataChanged = this._unstructuredHandler.executeHarvest(_context, source, f, (1 == nDocs), it.hasNext());
              }
              catch (Exception e) {
                contextController.handleExtractError(e, source); //handle extractor error if need be   
               
                it.remove(); // remove the document from the list...
                f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
               
                // (Note: this can't be source level error, so carry on harvesting - unlike below)
                continue;
              }
            } 
            if (contextController.isEntityExtractionRequired(source))
            {
              bMetadataChanged = true;
             
              // Text/Entity Extraction
              List<DocumentPojo> toAdd = new ArrayList<DocumentPojo>(1);
              toAdd.add(f);
              try {
                contextController.extractTextAndEntities(toAdd, source, false, false);
                if (toAdd.isEmpty()) { // this failed...
                  it.remove(); // remove the document from the list...
                  f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
                  continue;
                }//TESTED
              }
              catch (Exception e) {
                contextController.handleExtractError(e, source); //handle extractor error if need be       
                it.remove(); // remove the document from the list...
                f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
               
                if (source.isHarvestBadSource())
                {
                  // Source error, ignore all other documents
                  while (it.hasNext()) {
                    f = it.next();
                    f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
                    it.remove();
                  }
                  break;
                }
                else {
                  continue;
                }
                //TESTED
              }
            }
            if (bMetadataChanged) {
              // Ugly, but need to re-create doc json because metadata has changed
              String sTmpFullText = f.getFullText();
              f.setFullText(null); // (no need to serialize this, can save some cycles)
              _document = null;
              _docPojo = null;
              intializeDocIfNeeded(f, g);             
                  f.setFullText(sTmpFullText); //(restore)
            }
           
            // Can copy metadata from old documents to new ones:           
            handleDocumentUpdates(s.getOnUpdateScript(), f);
           
            // Check (based on the metadata and entities so far) whether to retain the doc
            if (rejectDoc(s.getRejectDocCriteria(), f)) {
              it.remove(); // remove the document from the list...
              f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
              continue;                             
            }
          }
          catch (Exception e) {
            this._context.getHarvestStatus().logMessage("SAH->UAH: " + e.getMessage(), true);           
            //DEBUG (don't output log messages per doc)
            //logger.error("SAH->UAH: " + e.getMessage(), e);
          }
           
          // Now create document since there's no risk of having to re-serialize
          intializeDocIfNeeded(f, g);
         
      // 3. final doc-level metadata fields:
         
          // If the title was null before, we might need to get it from a UAH field
          if (bTryTitleLater) {
            try {
              if (s.getTitle() != null)
              {
                intializeDocIfNeeded(f, g);
                if (JavaScriptUtils.containsScript(s.getTitle()))
                {
                  f.setTitle((String)getValueFromScript(s.getTitle(), null, null));
                }
                else
                {
                  f.setTitle(getFormattedTextFromField(s.getTitle(), null));
                }
              }
            }
            catch (Exception e)
            {
              this._context.getHarvestStatus().logMessage("title: " + e.getMessage(), true);           
              //DEBUG (don't output log messages per doc)
              //logger.error("title: " + e.getMessage(), e);
            }
          }
         
          // Extract Display URL if needed
          if (bTryDisplayUrlLater) {
            try {
              if (s.getDisplayUrl() != null)
              {
                intializeDocIfNeeded(f, g);
                if (JavaScriptUtils.containsScript(s.getDisplayUrl()))
                {
                  f.setDisplayUrl((String)getValueFromScript(s.getDisplayUrl(), null, null));
                }
                else
                {
                  f.setDisplayUrl(getFormattedTextFromField(s.getDisplayUrl(), null));
                }
              }
            }
            catch (Exception e)
            {
              this._context.getHarvestStatus().logMessage("displayUrl: " + e.getMessage(), true);           
              //DEBUG (don't output log messages per doc)
              //logger.error("displayUrl: " + e.getMessage(), e);
            }
          }         
          //TOTEST
         
          // If description was null before might need to get it from a UAH field
          if (bTryDescriptionLater) {
            try {
              if (s.getDescription() != null)
              {
                intializeDocIfNeeded(f, g);
                if (JavaScriptUtils.containsScript(s.getDescription()))
                {
                  f.setDescription((String)getValueFromScript(s.getDescription(), null, null));
                }
                else
                {
                  f.setDescription(getFormattedTextFromField(s.getDescription(), null));
                }
              }
            }
            catch (Exception e)
            {
              this._context.getHarvestStatus().logMessage("description2: " + e.getMessage(), true);           
              //DEBUG (don't output log messages per doc)
              //logger.error("description2: " + e.getMessage(), e);
            }           
          }
         
          // If fullText was null before might need to get it from a UAH field
          if (bTryFullTextLater) {
            try {
              if (s.getFullText() != null)
              {
                intializeDocIfNeeded(f, g);
                if (JavaScriptUtils.containsScript(s.getFullText()))
                {
                  f.setFullText((String)getValueFromScript(s.getFullText(), null, null));
                }
                else
                {
                  f.setFullText(getFormattedTextFromField(s.getFullText(), null));
                }
              }
            }
            catch (Exception e)
            {
              this._context.getHarvestStatus().logMessage("fullText2: " + e.getMessage(), true);           
              //DEBUG (don't output log messages per doc)
              //logger.error("fullText2: " + e.getMessage(), e);
            }           
          }
         
          // Extract Published Date if applicable
          if (s.getPublishedDate() != null)
          {
            if (JavaScriptUtils.containsScript(s.getPublishedDate()))
            {
              try
              {
                f.setPublishedDate(new Date(
                    DateUtility.parseDate((String)getValueFromScript(s.getPublishedDate(), null, null))));
              }
              catch (Exception e)
              {
                this._context.getHarvestStatus().logMessage("publishedDate: " + e.getMessage(), true);           
              }
            }
            else
            {
              try
              {
                f.setPublishedDate(new Date(
                    DateUtility.parseDate((String)getFormattedTextFromField(s.getPublishedDate(), null))));
              }
              catch (Exception e)
              {
                this._context.getHarvestStatus().logMessage("publishedDate: " + e.getMessage(), true);           
              }
            }
          }
         
      // 4. Entity level fields   
         
          // Extract Document GEO if applicable
         
          if (s.getDocumentGeo() != null)
          {
            try
            {
              f.setDocGeo(getDocGeo(s.getDocumentGeo()));
            }
            catch (Exception e)
            {
              this._context.getHarvestStatus().logMessage("docGeo: " + e.getMessage(), true);           
            }
          }

          // Extract Entities
          if (s.getEntities() != null)
          {
            f.setEntities(getEntities(s.getEntities(), f));
          }

          // Extract Associations
          if (s.getAssociations() != null)
          {
            f.setAssociations(getAssociations(s.getAssociations(), f));
          }
         
      // 5. Remove unwanted metadata fields
         
          removeUnwantedMetadataFields(s.getMetadataFields(), f);         
        }
        catch (Exception e)
        {
          this._context.getHarvestStatus().logMessage("Unknown: " + e.getMessage(), true);           
          //DEBUG (don't output log messages per doc)
View Full Code Here

 
  public static void main(String[] argv) {
   
    // Test entity expansion:
   
    StructuredAnalysisConfigPojo s = new StructuredAnalysisConfigPojo();
    s.setEntities(new ArrayList<EntitySpecPojo>(20));
    EntitySpecPojo e = null;
    e = new EntitySpecPojo();
    //a1
    e.setIterateOver("a");
    e.setDisambiguated_name("a.test1");
    s.getEntities().add(e);
    //a2
    e = new EntitySpecPojo();
    e.setIterateOver("a");
    e.setDisambiguated_name("a.test2");
    s.getEntities().add(e);
    //x1
    e = new EntitySpecPojo();
    e.setIterateOver("x");
    e.setDisambiguated_name("x.test1");
    s.getEntities().add(e);
    //a.b1
    e = new EntitySpecPojo();
    e.setIterateOver("a.b");
    e.setDisambiguated_name("a.b.test1");
    s.getEntities().add(e);
    //a.b.c.d1
    e = new EntitySpecPojo();
    e.setIterateOver("a.b.c.d");
    e.setDisambiguated_name("a.b.c.d.test1");
    s.getEntities().add(e);
    //a.b2
    e = new EntitySpecPojo();
    e.setIterateOver("a.b");
    e.setDisambiguated_name("a.b.test2");
    s.getEntities().add(e);
    //p.q1
    e = new EntitySpecPojo();
    e.setIterateOver("p.q");
    e.setDisambiguated_name("p.q.test1");
    s.getEntities().add(e);
    // null case
    e = new EntitySpecPojo();
    e.setDisambiguated_name("(null iterator)");
    s.getEntities().add(e);
   
    expandIterationLoops(s);
   
    System.out.println("TEST1: ENTITY ITERATION EXPANSION: ");
    System.out.println(new GsonBuilder().setPrettyPrinting().create().toJson(s));
   
    s.setAssociations(new ArrayList<AssociationSpecPojo>(20));
    AssociationSpecPojo assoc = null;
    assoc = new AssociationSpecPojo();
    //a1
    assoc.setIterateOver("a");
    assoc.setEntity1("a.test1");
    s.getAssociations().add(assoc);
    //a2
    assoc = new AssociationSpecPojo();
    assoc.setIterateOver("a");
    assoc.setEntity1("a.test2");
    s.getAssociations().add(assoc);
    //x1
    assoc = new AssociationSpecPojo();
    assoc.setIterateOver("x");
    assoc.setEntity1("x.test1");
    s.getAssociations().add(assoc);
    //a.b1
    assoc = new AssociationSpecPojo();
    assoc.setIterateOver("a.b");
    assoc.setEntity1("a.b.test1");
    s.getAssociations().add(assoc);
    //a.b.c.d1
    assoc =new AssociationSpecPojo();
    assoc.setIterateOver("a.b.c.d");
    assoc.setEntity1("a.b.c.d.test1");
    s.getAssociations().add(assoc);
    //a.b2
    assoc =new AssociationSpecPojo();
    assoc.setIterateOver("a.b");
    assoc.setEntity1("a.b.test2");
    s.getAssociations().add(assoc);
    //p.q1
    assoc =new AssociationSpecPojo();
    assoc.setIterateOver("p.q");
    assoc.setEntity1("p.q.test1");
    s.getAssociations().add(assoc);
    //"," case
    assoc =new AssociationSpecPojo();
    assoc.setIterateOver("p.q,RR");
    assoc.setEntity1("ITERATE OVER p.q,RR");
    s.getAssociations().add(assoc);
    //"/" case
    assoc =new AssociationSpecPojo();
    assoc.setIterateOver("p.q/SS");
    assoc.setEntity1("ITERATE OVER p.q/SS");
    s.getAssociations().add(assoc);
    // null case
    assoc =new AssociationSpecPojo();
    assoc.setEntity1("(null iterator)");
    s.getAssociations().add(assoc);
   
    //SHOULD HAVE TEST FOR ITERATE OVER p,q (now hand tested anyway)
   
    expandIterationLoops(s);
   
View Full Code Here

      }
      // (saved fields to rewrite later if not public, do up here for visibility)
      String url = source.getUrl();
      SourceRssConfigPojo rss = source.getRssConfig();
      List<SourcePipelinePojo> pxPipe = source.getProcessingPipeline();
      StructuredAnalysisConfigPojo sah = source.getStructuredAnalysisConfig();
      UnstructuredAnalysisConfigPojo uah = source.getUnstructuredAnalysisConfig();
     
      if (!bIsPublic) { // Cleanse URLs, remove processing pipeline information
        source.setPartiallyPublished(true); //TESTED
       
        // Copy URL info from px pipeline into the main source
        if ((null != source.getProcessingPipeline()) && !source.getProcessingPipeline().isEmpty()) {         
          SourcePipelinePojo firstEl = source.getProcessingPipeline().iterator().next();
          if (null != firstEl.web) {
            source.setRssConfig(firstEl.web);
          }
          else if (null != firstEl.feed) {
            source.setRssConfig(firstEl.feed);
          }
          else if (null != firstEl.database) {
            source.setUrl(firstEl.database.getUrl());
          }
          else if (null != firstEl.file) {
            source.setUrl(firstEl.file.getUrl());
          }
          source.setProcessingPipeline(new ArrayList<SourcePipelinePojo>()); // (delete px pipeline)
        }//(end if non-empty px pipeline)
        //TESTED
       
        int nIndex = -1;
        if ((null != url) && ((nIndex = url.indexOf('?')) >= 0)) {
          source.setUrl(url.substring(0, 1 + nIndex));
        }
        if (null != rss) {
          rss.setHttpFields(null); // (remove cookie information)
        }
        if ((null != rss) && (null != rss.getExtraUrls())) {
          SourceRssConfigPojo newRss = new SourceRssConfigPojo();
          ArrayList<SourceRssConfigPojo.ExtraUrlPojo> newList = new ArrayList<SourceRssConfigPojo.ExtraUrlPojo>(rss.getExtraUrls().size());
          for (SourceRssConfigPojo.ExtraUrlPojo urlObj: rss.getExtraUrls()) {
            SourceRssConfigPojo.ExtraUrlPojo newUrlObj = new SourceRssConfigPojo.ExtraUrlPojo();
            if ((null != urlObj.url) && ((nIndex = urlObj.url.indexOf('?')) >= 0)) {
              newUrlObj.url = urlObj.url.substring(0, 1 + nIndex);
            }
            else {
              newUrlObj.url = urlObj.url;
            }
            newUrlObj.title = urlObj.title;
            newUrlObj.description = urlObj.description;
            newUrlObj.publishedDate = urlObj.publishedDate;
            newUrlObj.fullText = urlObj.fullText;
            newList.add(newUrlObj);
          }
          newRss.setExtraUrls(newList);
          source.setRssConfig(newRss);
        }
        else if (null != rss) {
          source.setRssConfig(null);         
        }
        if (null != source.getStructuredAnalysisConfig()) {
          source.setStructuredAnalysisConfig(new StructuredAnalysisConfigPojo());
        }//TESTED
        if (null != source.getUnstructuredAnalysisConfig()) {
          source.setUnstructuredAnalysisConfig(new UnstructuredAnalysisConfigPojo());
        }//TESTED        
      }
View Full Code Here

TOP

Related Classes of com.ikanow.infinit.e.data_model.store.config.source.StructuredAnalysisConfigPojo

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.