Source Code of com.ikanow.infinit.e.harvest.extraction.document.rss.FeedHarvester

/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.harvest.extraction.document.rss;


import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;


import org.apache.log4j.Logger;
import org.bson.types.ObjectId;
import org.jdom.Element;
import org.jdom.output.XMLOutputter;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.XML;


import com.ikanow.infinit.e.data_model.InfiniteEnums;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDocumentLevelException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelTransientException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.HarvestEnum;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourceRssConfigPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourceRssConfigPojo.ExtraUrlPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.GeoPojo;
import com.ikanow.infinit.e.data_model.store.social.authentication.AuthenticationPojo;
import com.ikanow.infinit.e.harvest.HarvestContext;
import com.ikanow.infinit.e.harvest.extraction.document.HarvesterInterface;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager;
import com.ikanow.infinit.e.harvest.extraction.document.file.XmlToMetadataParser;
import com.ikanow.infinit.e.harvest.utils.DateUtility;
import com.ikanow.infinit.e.harvest.utils.PropertiesManager;
import com.ikanow.infinit.e.harvest.utils.TextEncryption;
import com.sun.syndication.feed.module.georss.GeoRSSModule;
import com.sun.syndication.feed.module.georss.GeoRSSUtils;
import com.sun.syndication.feed.module.georss.geometries.AbstractGeometry;
import com.sun.syndication.feed.module.georss.geometries.AbstractRing;
import com.sun.syndication.feed.module.georss.geometries.Envelope;
import com.sun.syndication.feed.module.georss.geometries.LineString;
import com.sun.syndication.feed.module.georss.geometries.LinearRing;
import com.sun.syndication.feed.module.georss.geometries.Polygon;
import com.sun.syndication.feed.synd.SyndContentImpl;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndEntryImpl;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.feed.synd.SyndFeedImpl;
import com.sun.syndication.fetcher.FeedFetcher;
import com.sun.syndication.fetcher.impl.FeedFetcherCache;
import com.sun.syndication.fetcher.impl.HashMapFeedInfoCache;
import com.sun.syndication.fetcher.impl.HttpClientFeedFetcher;
import com.sun.syndication.fetcher.impl.HttpURLFeedFetcher;


public class FeedHarvester implements HarvesterInterface
{


  // List of Feeds
  private List<DocumentPojo> docsToAdd = null;
  private List<DocumentPojo> docsToUpdate = null;
  @SuppressWarnings("unused")
  private List<DocumentPojo> docsToRemove = null;
  private Set<Integer> sourceTypesCanHarvest = new HashSet<Integer>();


  private HarvestContext _context;


  // Initialize the Logger
  private static final Logger logger = Logger.getLogger(FeedHarvester.class);


  // Parameters
  PropertiesManager props = new PropertiesManager();
  
  /**
   * Default Constructor, does nothing
   */
  public FeedHarvester()
  {      
    sourceTypesCanHarvest.add(InfiniteEnums.FEEDS);
  }


  @Override
  protected void finalize() throws Throwable
  {


  }


  // State across a harvest
  private int nTmpHttpErrors = 0;
  private int nTmpDocsSubmitted = 0;


  public void executeHarvest(HarvestContext context, SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove) {
    _context = context;
    this.docsToAdd = toAdd;
    this.docsToUpdate = toUpdate;
    this.docsToRemove = toRemove;
    
    // Fill in any blank user agents at the top level, for simplicity
    if (null != props.getHarvestUserAgent()) {
      String userAgent = props.getHarvestUserAgent();      
      if (null == source.getRssConfig()) {
        source.setRssConfig(new SourceRssConfigPojo());
      }
      if (null == source.getRssConfig().getUserAgent()) { // ...But rss.userAgent doesn't:
        source.getRssConfig().setUserAgent(userAgent); // then override it
      }
      if (null != source.getRssConfig().getSearchConfig()) { // If rss.searchConfig exists...
        if (null == source.getRssConfig().getSearchConfig().getUserAgent()) { // but rss.searchConfig.userAgent doesn't:
          source.getRssConfig().getSearchConfig().setUserAgent(userAgent);
        }
      }
    } // (end default user agent logic)
    
    try 
    {
      logger.debug("Source: " + source.getUrl());


      //compile feeds from source
      processFeed(source);


      logger.debug("Doc List Size: " + this.docsToAdd.size());      
      logger.debug("Doc Update Size: " + this.docsToUpdate.size());      
    } 
    catch (Exception e)
    {
      // If an exception occurs log the error
      logger.error("Exception Message: " + e.getMessage(), e);
    }
  }


  // Set up the authentication credentials
  private AuthenticationCredentials authenticateFeed(AuthenticationPojo auth) {
    AuthenticationCredentials authenticationCredentials = null;
    // Added in the event that authentication is required for the feed
    if (auth != null) 
    {


      String decpword = new TextEncryption().decrypt(auth.getPassword());
      authenticationCredentials = new AuthenticationCredentials(auth.getUsername(), decpword);
    }


    return authenticationCredentials;
  }


  // Get the syndicated feed using rome
  private SyndFeed getFeed(SourcePojo source, String url) 
  {
    synchronized(FeedHarvester.class) { // (workaround for ROME concurrency issues: http://www.jdom.org/pipermail/jdom-interest/2008-December/016252.html)


      if (null == url) {
        url = source.getUrl();
      }
      for (int i = 0; i < 2; ++i) { // Will have 2 goes in case of failure
        // Check to see if the feed requires authentication
        if (source.getAuthentication() != null) //requires auth
        {
          try 
          {
            FeedFetcher feedFetcher = new HttpClientFeedFetcher(null, authenticateFeed(source.getAuthentication()));
            if ((null != source.getRssConfig()) && (null != source.getRssConfig().getUserAgent())) {
              feedFetcher.setUserAgent(source.getRssConfig().getUserAgent());
            }
            SyndFeed retVal = feedFetcher.retrieveFeed(new URL(this.cleanUrlStart(url)));
            if (null == retVal) {
              handleRssError(new RuntimeException("Unknown RSS error") , source);              
            }
            return retVal;
          } 
          catch (Exception e) {
            System.out.println(i + "  " + url);
            
            if (1 == i) { // else just try again
              handleRssError(e, source);
            }
          }
        }
        else //does not require auth
        {
          try 
          {
            FeedFetcherCache feedInfoCache = HashMapFeedInfoCache.getInstance();
            FeedFetcher feedFetcher = new HttpURLFeedFetcher(feedInfoCache);
            if ((null != source.getRssConfig()) && (null != source.getRssConfig().getUserAgent())) {
              feedFetcher.setUserAgent(source.getRssConfig().getUserAgent());
            }
            SyndFeed retVal = feedFetcher.retrieveFeed(new URL(this.cleanUrlStart(url)));
            if (null == retVal) {
              handleRssError(new RuntimeException("Unknown RSS error") , source);              
            }
            return retVal;
          } 
          catch (Exception e) 
          {
            if (1 == i) { // else just try again
              handleRssError(e, source);
            }
          }
        }
        
        // If still here, must have errored so sleep before trying again
        try { Thread.sleep(10000); } catch (InterruptedException e) {}
        
      } // (end get 2 goes)
    }
    return null;
  }


  // Utility for RSS extraction
  
  private void handleRssError(Exception e, SourcePojo source) {
    // Error handling:
    // - If it's a 500 or 502 or 503 or 504 then just log and carry on
    // - Otherwise, if you get the same message twice in succession then error out
    boolean bTempSuspendSource = false;
    boolean fullSuspendSource = false;
    String sNewMessage = e.getMessage();
    
    if (null != sNewMessage) {
      if (null != source.getHarvestStatus()) {
        
        String sOldMessage = source.getHarvestStatus().getHarvest_message();
        if (null != sOldMessage)  { 
          
          // Only consider suspending if we didn't any extract docs last time round:          
          if (!sOldMessage.contains("\nextracted=") || sOldMessage.contains("\nextracted=0")) {
            String[] oldMessageLines = sOldMessage.split("\n");
            String[] newMessageLines = sNewMessage.split("\n");
            
            // Strip the dates out from both, also errors that differ only by line numbers            
            sOldMessage = oldMessageLines[0].replaceAll("\\[[0-9T:-]+\\]", "").replaceAll(" line [0-9]+", "").trim();
            sNewMessage = newMessageLines[0].replaceAll("\\[[0-9T:-]+\\]", "").replaceAll(" line [0-9]+", "").trim();


            long now = new Date().getTime();


            if (sOldMessage.equals(sNewMessage)) {
              bTempSuspendSource = true;
              
              if (sNewMessage.matches(".*50[0234].*")) {
                // In theory this is a temp error, but it seems to be long lived also... if we haven't received anything
                // in the last day then temp suspend (also check for full suspension after 7days as above)
                if ((null == source.getHarvestStatus().getExtracted()) ||
                    ((now - source.getHarvestStatus().getExtracted().getTime()) > 1*24*3600L*1000L))
                {
                  bTempSuspendSource = true;
                }
              }
              else {
                bTempSuspendSource = true;                
              }              
              
              // Also, if haven't extracted anything for last 7 days then suspend "for good"
              if (null != source.getHarvestStatus().getHarvested()) {


                // Harvested within the last day (ie not _just_ turned back on again...)
                if ((now - source.getHarvestStatus().getHarvested().getTime()) < 24*3600L*1000L) {
                  // If it hasn't extracted anything within a week then turn it off properly
                  if ((null == source.getHarvestStatus().getExtracted()) ||
                      ((now - source.getHarvestStatus().getExtracted().getTime()) > 7*24*3600L*1000L))
                  {
                    fullSuspendSource = true;
                  }
                }
              }//TESTED
            }
          }//TESTED
        }
      }
    }//TESTED
    _context.getHarvestStatus().update(source, new Date(), HarvestEnum.error, sNewMessage, bTempSuspendSource, fullSuspendSource);
  }
  //TESTED (temp error ignore, allow 2 identical errors before failing)
  
  // Process the feed
  private void processFeed(SourcePojo source) throws Exception {
    // Process the feed
    LinkedList<SyndFeed> feeds = new LinkedList<SyndFeed>();
    boolean confirmedUrlsExtracted = false;
    
    if ((null != source.getUrl()) && ((null == source.getRssConfig())||(null == source.getRssConfig().getSearchConfig()))) {
      // (if the second clause is false, the URL is a search query, will process differently, inside buildFeedList)
      
      SyndFeed feed = getFeed(source, null);
      if (null != feed) {
        feeds.add(feed);
      }
    }
    else if ((null != source.getRssConfig())&&(null != source.getRssConfig().getSearchConfig()))
    {
      try {
        FeedHarvester_searchEngineSubsystem searchEngineSubsystem = new FeedHarvester_searchEngineSubsystem();
        searchEngineSubsystem.generateFeedFromSearch(source, _context, null);
        confirmedUrlsExtracted = true;
      }//TESTED
      catch (ExtractorDocumentLevelException e) {
        handleRssError(e, source);
        confirmedUrlsExtracted = true;        
      }//TESTED
      catch (ExtractorSourceLevelTransientException e) {
        handleRssError(e, source);
      }//TESTED
    }//TESTED
    
    if ((null != source.getRssConfig())&&(null != source.getRssConfig().getExtraUrls())&&(null == source.getRssConfig().getSearchConfig())) { 
      // Some of these might be RSS feeds, check if title==null
      for (ExtraUrlPojo url: source.getRssConfig().getExtraUrls()) {
        if ((null == url.title) && (null != url.url)) {
          SyndFeed feed = getFeed(source, url.url);
          if (null != feed) {
            feeds.add(feed);
          }          
        }
        else if (null != url.url){
          confirmedUrlsExtracted = true;
        }
      }
    }//TESTED


    if ( !feeds.isEmpty() || confirmedUrlsExtracted ) // (second case: also have extra URLs)
    {
      // Error handling, part 1:
      this.nTmpHttpErrors = 0;
      this.nTmpDocsSubmitted = 0;


      // Extract the feed and place into the pojo
      try {
        buildFeedList(feeds, source);
      }
      catch (Exception e) {
        // Propagate upwards:
        throw e;
      }
      
      // Error handling part 2:
      // clean up
      if ((nTmpHttpErrors == this.nTmpDocsSubmitted) && (this.nTmpDocsSubmitted > 5))
      { 
        // any time when all a decent number of feeds are errors


        logger.error("Source generates only invalid feeds: " + " http_errs=" + nTmpHttpErrors + " source=" + source.getUrl());


        if (this.nTmpDocsSubmitted < 20) {
          //harvested unsucessfully, post in mongo
          _context.getHarvestStatus().update(source, new Date(), HarvestEnum.error, "Extraction errors: redirect_errs=" + 
              "http_errs=" + nTmpHttpErrors, true, false);        
        }
        else {
          //harvested unsucessfully, post in mongo *AND DISABLE*
          _context.getHarvestStatus().update(source, new Date(), HarvestEnum.error, "Extraction errors: redirect_errs=" + 
              "http_errs=" + nTmpHttpErrors, true, true);        
        }
      }  
      else {
        //harvested successfully, post in mongo
        _context.getHarvestStatus().update(source, new Date(), HarvestEnum.in_progress, "", false, false);        
      }
    }
    // (if we're not in here it must be because an error has been logged)
  }


  // Build the feed list
  @SuppressWarnings("unchecked")
  private void buildFeedList(LinkedList<SyndFeed> syndFeeds, SourcePojo source) 
  {
    // If there's a max number of sources to get per harvest, configure that here:
    long nWaitTime_ms = props.getWebCrawlWaitTime();
    long nMaxTime_ms = props.getMaxTimePerSource(); // (can't override this, too easy to break the system...)
    
    int nMaxDocsPerSource = props.getMaxDocsPerSource();
    if (_context.isStandalone()) {
      nMaxDocsPerSource = _context.getStandaloneMaxDocs();
    }    
    long nNow = new Date().getTime();
    if (null != source.getRssConfig()) {
      if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
        nWaitTime_ms = source.getRssConfig().getWaitTimeOverride_ms();
      }
    }
    long nMaxDocs = Long.MAX_VALUE;
    if (nWaitTime_ms > 0) {
      nMaxDocs = 1 + nMaxTime_ms/nWaitTime_ms;
    }
    if (nMaxDocs > nMaxDocsPerSource) { // (another limit, take the smaller of the 2)
      nMaxDocs = nMaxDocsPerSource;
    }
    // Can override system settings if less:
    if ((null != source.getThrottleDocs()) && (source.getThrottleDocs() < nMaxDocs)) {
      nMaxDocs = source.getThrottleDocs();
    }
    // (end per feed configuration)
    
    
    // Add extra docs
    List<SyndEntry> tmpList = null;
    boolean bCreatedAggregateList = false;
    int nRealSyndEntries = 0;
    
    for (SyndFeed feed: syndFeeds) {
      if (0 == nRealSyndEntries) {
        tmpList = feed.getEntries();
      }
      else if (!bCreatedAggregateList) {
        bCreatedAggregateList = true;
        tmpList = new LinkedList<SyndEntry>(tmpList);
        tmpList.addAll(feed.getEntries());
      }
      else {
        tmpList.addAll(feed.getEntries());
      }
      nRealSyndEntries += feed.getEntries().size();
    }
    if (null == tmpList) {
      tmpList = new LinkedList<SyndEntry>();      
    }
    //TESTED
    
    if ((null != source.getRssConfig()) && (null != source.getRssConfig().getExtraUrls())) {
      for (ExtraUrlPojo extraUrl: source.getRssConfig().getExtraUrls()) {
        if (null == extraUrl.title) {
          continue; // (this is an RSS feed not a URL)
        }//TESTED
        SyndEntryImpl synd = new SyndEntryImpl();
        synd.setLink(extraUrl.url);
        if (null != extraUrl.description) {
          SyndContentImpl description = new SyndContentImpl();
          description.setValue(extraUrl.description);
          synd.setDescription(description);
        }
        synd.setTitle(extraUrl.title);
        if (null != extraUrl.publishedDate) {
          try {
            synd.setPublishedDate(new Date(DateUtility.parseDate(extraUrl.publishedDate)));            
          }
          catch (Exception e) {} // do nothign will use now as pub date
        }        
        tmpList.add((SyndEntry) synd);


        if (null != extraUrl.fullText) {
          SyndFeedImpl fullTextContainer = new SyndFeedImpl();
          fullTextContainer.setDescription(extraUrl.fullText);
          synd.setSource(fullTextContainer);
        }
      }
    }
    
    // Then begin looping over entries


    LinkedList<String> duplicateSources = new LinkedList<String>();     
    try {
      Map<String, List<SyndEntry>> urlDups = new HashMap<String, List<SyndEntry>>();
      int nSyndEntries = 0;
      for ( Object synd : tmpList )
      {        
        nSyndEntries++; // (keep count so we know we're accessing our own fake SyndEntryImpls)
        final SyndEntry entry = (SyndEntry)synd;
  
        if ( null != entry.getLink() ) //if url returns null, skip this entry
        {
          String url = entry.getLink();
          if ((nSyndEntries <= nRealSyndEntries) || (null == entry.getSource())) { // (else URL can be what it wants)
            url = this.cleanUrlStart(entry.getLink());
          }


          // Intra-source distribution logic:
          if ((null != source.getDistributionTokens()) && (null != source.getDistributionFactor())) {
            int split = Math.abs(url.hashCode()) % source.getDistributionFactor();
            if (!source.getDistributionTokens().contains(split)) {
              continue;
            }
          }//TESTED (copy and paste from FileHarvester)
          
          if (null != source.getRssConfig()) { // Some RSS specific logic
            // If an include is specified, must match
            Matcher includeMatcher = source.getRssConfig().getIncludeMatcher(url);
            if (null != includeMatcher) {
              if (!includeMatcher.find()) {
                continue;
              }
            }
            // If an exclude is specified, must not match
            Matcher excludeMatcher = source.getRssConfig().getExcludeMatcher(url);
            if (null != excludeMatcher) {
              if (excludeMatcher.find()) {
                continue;
              }
            }
          }


          // Some error checking:
          // sometimes the URL seems to have some characters in front of the HTTP - remove these
          this.nTmpDocsSubmitted++;
          if (null == url) {
            this.nTmpHttpErrors++;
            continue;
          }


          // Also save the title and description:
          String title = "";
          if (null != entry.getTitle()) {
            title = entry.getTitle();
          }
          String desc = "";
          if (null != entry.getDescription()) {
            desc = entry.getDescription().getValue();          
          }        
          boolean duplicate = false;


          // Look for duplicates within the current set of sources
          List<SyndEntry> possDups = null;
          if (null == (possDups = urlDups.get(url))) { // (new URL)
            possDups = new LinkedList<SyndEntry>();
            possDups.add(entry);
            urlDups.put(url, possDups);
          }
          else { // (old URL, check if this is a duplicate...)
            @SuppressWarnings("unused")
            int nCount = 0; // (for handy debugging)
            
            for (SyndEntry possDup : possDups) {
              if (possDup.getTitle().equals(title) || 
                  ((null != possDup.getDescription()) && possDup.getDescription().getValue().equals(desc)) ||
                  ((null != possDup.getDescription()) && (null == entry.getDescription())))
              {
                // If *either* the title or the description matches as well as the URL...
                duplicate = true;
                break;
              }
              nCount++;
            }
            
            if (!duplicate) {
              possDups.add(entry);            
            }
            else { // DUPLICATE: ensure we have minimal set of data to cover all cases: 
              boolean bTitleMatch = false;
              boolean bDescMatch = false;
              for (SyndEntry possDup : possDups) {
                if (!bTitleMatch && possDup.getTitle().equals(title)) { // (don't bother if already have a title match)
                  bTitleMatch = true;
                }
                else if (!bDescMatch) { // (don't yet have a desc match(
                  if (null != entry.getDescription()) {
                    if (null != possDup.getDescription()) { // (neither desc is null)
                      if (possDup.getDescription().getValue().equals(desc)) {
                        bDescMatch = true;
                      }
                    }
                  }
                  else { // curr desc is null
                    if (null == possDup.getDescription()) { // dup desc is null
                      bDescMatch = true;
                    }


                  } // (end various title match/desc match/both have no desc cases
                } // (end if no desc match)
                if (bTitleMatch && bDescMatch) {
                  break; // (no way can fire)
                }
              } // (end loop over dups)


              if (!bTitleMatch || !bDescMatch) {
                possDups.add(entry);                          
              }


            } // (end is duplicate, nasty logic to add minimal set to dup list to cover all titles, descs)
          }
          if (duplicate) {
            continue;
          }


          try {          
            DuplicateManager qr = _context.getDuplicateManager();
            if (null != entry.getDescription()) {
              duplicate = qr.isDuplicate_UrlTitleDescription(url, title.replaceAll("\\<.*?\\>", "").trim(), desc.replaceAll("\\<.*?\\>", "").trim(), source, duplicateSources);
            }
            else {
              duplicate = qr.isDuplicate_UrlTitleDescription(url, title.replaceAll("\\<.*?\\>", "").trim(), null, source, duplicateSources);            
              //^^^(this is different to isDuplicate_UrlTitle because it enforces that the description be null, vs just checking the title)
            }
            if (duplicate && (null != source.getRssConfig()) && (null != source.getRssConfig().getUpdateCycle_secs())) { 
              // Check modified times...
              Date dupModDate = qr.getLastDuplicateModifiedTime();
              ObjectId dupId = qr.getLastDuplicateId();
              
              if ((null != dupModDate) && (null != dupId)) {
                if (dupModDate.getTime() + source.getRssConfig().getUpdateCycle_secs()*1000 < nNow) {
                  
                  DocumentPojo doc = buildDocument(url, entry, source, duplicateSources);
                  if ((nSyndEntries > nRealSyndEntries) && (null != entry.getSource())) {
                    // (Use dummy TitleEx to create a "fake" full text block)
                    doc.setFullText(entry.getSource().getDescription());
                  }
                  doc.setUpdateId(dupId); // (set _id to document I'm going to overwrite) 
                  this.docsToUpdate.add(doc);
                  
                  if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
                    source.setReachedMaxDocs();
                    break; // (that's enough documents)
                  }
                }
              }
            }//TESTED (duplicates we update instead of ignoring)
            
            if (!duplicate) {
              DocumentPojo doc = buildDocument(url, entry, source, duplicateSources);
              if ((nSyndEntries > nRealSyndEntries) && (null != entry.getSource())) {
                // (Use dummy TitleEx to create a "fake" full text block)
                doc.setFullText(entry.getSource().getDescription());
              }
              this.docsToAdd.add(doc);


              if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
                source.setReachedMaxDocs();
                break; // (that's enough documents)
              }
            }
            if (this.nTmpDocsSubmitted > 20) { // (some arbitrary "significant" number)
              if (nTmpHttpErrors == this.nTmpDocsSubmitted) {
                break;
              }
            }
          } 
          catch (Exception e) {
            // If an exception occurs log the error
            //DEBUG: don't log document-level errors
            //logger.error("Exception Message: " + e.getMessage(), e);
          } 
        }
      } // (end loop over feeds in a syndicate)
    }
    catch (Exception e) {
      // If an exception occurs log the error
      logger.error("Exception Message: " + e.getMessage(), e);
    } 
  }


  private DocumentPojo buildDocument(String cleansedUrl, SyndEntry entry, SourcePojo source, LinkedList<String> duplicateSources) {


    // create the feed pojo
    DocumentPojo doc = new DocumentPojo();


    doc.setUrl(cleansedUrl);
    doc.setCreated(new Date());
    doc.setModified(doc.getCreated());


    // Strip out html if it is present
    if ( entry.getTitle() != null )
      doc.setTitle(entry.getTitle().replaceAll("\\<.*?\\>", "").trim());
    if ( entry.getDescription() != null )
      doc.setDescription(entry.getDescription().getValue().replaceAll("\\<.*?\\>", "").trim());
    if ( entry.getPublishedDate() != null ) {
      doc.setPublishedDate(entry.getPublishedDate());
    }
    else {
      doc.setPublishedDate(doc.getCreated());
    }


    // Clone from an existing source if we can:
    if (!duplicateSources.isEmpty() && (null == doc.getUpdateId())) { // (can't duplicate updating document)
      doc.setDuplicateFrom(duplicateSources.getFirst());
    }
    
    //GeoRSS
    GeoRSSModule geoRSSModule = GeoRSSUtils.getGeoRSS(entry); //currently does not handle <georss:circle>
    if (null != geoRSSModule)
    {
      if (null != geoRSSModule.getPosition())
      {
        double lat = geoRSSModule.getPosition().getLatitude();
        double lon = geoRSSModule.getPosition().getLongitude();
        GeoPojo gp = new GeoPojo();
        gp.lat = lat;
        gp.lon = lon;
        doc.setDocGeo(gp);
      }
      if (null != geoRSSModule.getGeometry())
      {
        AbstractGeometry ag = geoRSSModule.getGeometry();
        if(ag.getClass().equals(new LineString().getClass()))
        { //<georss:line>
          LineString ls = ((LineString)geoRSSModule.getGeometry());
          
          double latAvg = 0.0;
          double lonAvg = 0.0;
          int length = ls.getPositionList().size();
          for (int i = 0; i < length; i ++)
          {
            latAvg += ls.getPositionList().getLatitude(i);
            lonAvg += ls.getPositionList().getLongitude(i);
          }
          latAvg = latAvg/length;
          lonAvg = lonAvg/length;
          GeoPojo gp = new GeoPojo();
          gp.lat = latAvg;
          gp.lon = lonAvg;
          doc.setDocGeo(gp);
        }
        else if (ag.getClass().equals(new Polygon().getClass())) //<georss:polygon>
        {
          Polygon poly = ((Polygon)geoRSSModule.getGeometry());
          AbstractRing ar = poly.getExterior();
          LinearRing lr = (LinearRing)ar;


          double latAvg = 0.0;
          double lonAvg = 0.0;
          int length = lr.getPositionList().size();
          for (int i = 0; i < length; i ++)
          {
            latAvg += lr.getPositionList().getLatitude(i);
            lonAvg += lr.getPositionList().getLongitude(i);
          }
          latAvg = latAvg/length;
          lonAvg = lonAvg/length;
          GeoPojo gp = new GeoPojo();
          gp.lat = latAvg;
          gp.lon = lonAvg;
          doc.setDocGeo(gp);
        }
        else if(ag.getClass().equals(new Envelope().getClass()))
        { //<georss:box>
          Envelope env = ((Envelope)geoRSSModule.getGeometry());
          
          double latAvg = (env.getMaxLatitude()+env.getMinLatitude())/2;
          double lonAvg = (env.getMaxLongitude()+env.getMinLongitude())/2;


          GeoPojo gp = new GeoPojo();
          gp.lat = latAvg;
          gp.lon = lonAvg;
          doc.setDocGeo(gp);
        }
      }
    }// end if GeoRSS
    
    // Arbitrary other metadata:


    if (null != entry.getForeignMarkup()) {
      JSONObject rssMetadata = new JSONObject();
      
      @SuppressWarnings("unchecked")
      List<Element> fms = (List<Element>) entry.getForeignMarkup();
      for (Element fm : fms) {
        try {
          JSONObject subObj = XML.toJSONObject(new XMLOutputter().outputString(fm));
          if (1 == subObj.length()) {
            for (String name: JSONObject.getNames(subObj)) {
              rssMetadata.put(name, subObj.get(name));              
            }
          }
          else { // (this will never happen in practice?)
            rssMetadata.put(fm.getName(), subObj);
          }
        } 
        catch (JSONException e) {} // (do nothing just carry on)
      }
      if (!fms.isEmpty()) {
        doc.addToMetadata("_FEED_METADATA_", XmlToMetadataParser.convertJsonObjectToLinkedHashMap(rssMetadata));
      }
    }//TESTED (longs converted to string, eg edgar:assistantDirector from "http.www.sec.gov.archives.edgar.usgaap.rss.xml")


    return doc;
  }


  // 
  // Utility function to trim whitespace (or anything - eg URL encoded +) from the start of URLs
  //
  private String cleanUrlStart(String url) 
  {
    url = url.replaceFirst("feed://", "http://");
    if (!url.startsWith("http://") && !url.startsWith("https://")) {
      int nIndex = url.indexOf("http://");
      if (-1 == nIndex) {
        nIndex = url.indexOf("https://");
        if (-1 == nIndex) {
          return "http://" + url.trim();
        }
      }
      url = url.substring(nIndex);
    }
    return url;    
  }


  @Override
  public boolean canHarvestType(int sourceType) {
    return sourceTypesCanHarvest.contains(sourceType);
  }
}
Source Code of com.ikanow.infinit.e.harvest.extraction.document.rss.FeedHarvester

Related Classes of com.ikanow.infinit.e.harvest.extraction.document.rss.FeedHarvester