Package eu.planets_project.ifr.core.storage.impl.web

Source Code of eu.planets_project.ifr.core.storage.impl.web.YahooImageAPIDigitalObjectManagerImpl

package eu.planets_project.ifr.core.storage.impl.web;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import eu.planets_project.ifr.core.storage.api.DigitalObjectManager;
import eu.planets_project.ifr.core.storage.api.query.Query;
import eu.planets_project.ifr.core.storage.api.query.QueryString;
import eu.planets_project.ifr.core.storage.api.query.QueryValidationException;
import eu.planets_project.services.datatypes.Content;
import eu.planets_project.services.datatypes.DigitalObject;

/**
* Implements the DigitalObjectManager interface for the Yahoo Image API. This
* DigitalObjectManager is read-only and queryable with a QueryString.
*
* This implementation is lazy-loading:
*
* - When list is first called, a batch of 50 query results (which is the
*   maximum allowed size for an API call) is retrieved. The total number
*   of available results (which typically exceeds 50 by far) is extracted
*   from the XML response.
*  
* - All results are buffered in memory
*
* - Additional results are retrieved via HTTP only if needed when the
*   get() Method of the YahooResultList implementation is called.
*  
* - Room for improvement: additional query results are always added to the
*   in-memory buffer. The buffer is never flushed.
*  
* - Room for improvement: results are always loaded sequentially - no gaps
*   allowed. I.e. if index 10,000 is retrieved, results 1-9,999 will be
*   downloaded first.
*
* @author SimonR
*
*/
public class YahooImageAPIDigitalObjectManagerImpl implements DigitalObjectManager {
 
    /**
     * Logger.
     */
    private static Logger log = Logger.getLogger(eu.planets_project.ifr.core.storage.impl.web.SimpleSRUDigitalObjectManagerImpl.class.getName());

  /**
     * API base URL (incl. Yahoo App ID).
     */
    private static String API_BASE_URL = "http://search.yahooapis.com/ImageSearchService/V1/imageSearch";
    private static String Y_APP_ID = "Oc5vBjrV34EdL30ngS_5VnW9PVk0jRSkwyzQO0IDDNXCsBJE4OSq5NE1NF4FToohppPX";
    private static String BASE_URL = API_BASE_URL + "?appid=" + Y_APP_ID + "&";
   
    /**
     * HttpClient timeout in ms.
     */
    private static final int TIMEOUT = 10000;
 
    public URI storeAsNew(URI pdURI, DigitalObject digitalObject) throws eu.planets_project.ifr.core.storage.api.DigitalObjectManager.DigitalObjectNotStoredException {
        throw new DigitalObjectNotStoredException("Storing not supported by this implementation.");
    }
   
    public URI storeAsNew(DigitalObject digitalObject) throws eu.planets_project.ifr.core.storage.api.DigitalObjectManager.DigitalObjectNotStoredException {
        throw new DigitalObjectNotStoredException("Storing not supported by this implementation.");
    }

    public URI updateExisting(URI pdURI, DigitalObject digitalObject) throws eu.planets_project.ifr.core.storage.api.DigitalObjectManager.DigitalObjectNotStoredException, eu.planets_project.ifr.core.storage.api.DigitalObjectManager.DigitalObjectNotFoundException {
        throw new DigitalObjectNotStoredException("Storing not supported by this implementation.");
    }

    /**
     * {@inheritDoc}
     * @see eu.planets_project.ifr.core.storage.api.DigitalObjectManager#isWritable(java.net.URI)
     */
    public boolean isWritable( URI pdURI ) {
      return false;
    }
 
    /**
     * {@inheritDoc}
     * @see eu.planets_project.ifr.core.storage.api.DigitalObjectManager#list(java.net.URI)
     */
    public List<URI> list(URI pdURI) {
      // list() without query not supported - empty result list
      return new YahooResultList(null);
    }

  /**
   * {@inheritDoc}
   * @see eu.planets_project.ifr.core.storage.api.DigitalObjectManager#retrieve(java.net.URI)
   */
  public DigitalObject retrieve(URI pdURI) throws DigitalObjectNotFoundException {
    try {
      // Will simply attempt to download the object at the provided URI,
      // no matter whether it was part of the query result or not
      return new DigitalObject.Builder(Content.byReference(pdURI.toURL())).title(pdURI.getPath().substring( pdURI.getPath().lastIndexOf('/')+1)).build();
    } catch (Exception e) {
      throw new DigitalObjectNotFoundException("Error retrieving object from " + pdURI.toString() + " (" + e.getMessage() + ")");
    }
  }

  /**
   * {@inheritDoc}
   * @see eu.planets_project.ifr.core.storage.api.DigitalObjectManager#getQueryTypes()
   */
  public List<Class<? extends Query>> getQueryTypes(){
    ArrayList<Class<? extends Query>> qTypes = new ArrayList<Class<? extends Query>>();
    qTypes.add(Query.STRING);
    return qTypes;
  }

    /**
     * {@inheritDoc}
     * @see eu.planets_project.ifr.core.storage.api.DigitalObjectManager#list(java.net.URI, eu.planets_project.ifr.core.storage.api.query.Query)
     */
    public List<URI> list(URI pdURI, Query q) throws QueryValidationException {
      if (q == null) {
        // list() without query not supported - empty result list
        throw new QueryValidationException("null query not allowed");
      }
     
      if (pdURI == null) {
        // Hierarchy is flat (no sub-directories) - only allow 'null' as pdURI!
           if (q instanceof QueryString) {
             return new YahooResultList((QueryString) q);
           } else {
             throw new QueryValidationException("Unsupported query type");
           }
      } else {
        return new YahooResultList(null);
      }
  }
 
  /**
   * Yahoo result list representation.
   *
   */
  public class YahooResultList extends AbstractList<URI> {

      /**
       * Query string.
       */
    private String queryString = null;
   
      /**
       * The HTTP client.
       */
      private HttpClient httpClient = new HttpClient();
     
      /**
       * ArrayList storing all retrieved URIs so far.
       */
      private ArrayList<YahooResult> bufferedQueryResults = new ArrayList<YahooResult>();
     
      /**
       * Total number of results.
       */
      private int size = 0;
   
    /**
     * @param query The query
     */
    public YahooResultList(QueryString query) {
      // Set query string
      if (query != null)
        this.queryString = query.getQuery();
     
        // Set up HTTP client
        String host = System.getProperty("http.proxyHost");
          String port = System.getProperty("http.proxyPort");
          if( host != null && port != null ) {
              httpClient.getHostConfiguration().setProxy(host, Integer.parseInt(port));
          }
      httpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler(1, false));
      httpClient.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, Integer.valueOf(TIMEOUT));
      httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(TIMEOUT);
      httpClient.getHttpConnectionManager().getParams().setSoTimeout(TIMEOUT);
     
      // Download first 50 results (50 = max. result size for Yahoo API)
      if (queryString != null)
        this.size = nextFiftyResults(0);
    }
   
    /**
     * {@inheritDoc}
     * @see java.util.AbstractList#get(int)
     */
    public URI get(int index) {
      if (index >= bufferedQueryResults.size()) {
        // Not yet in cache - lazy load
        System.out.println("getting index " + index);
        int retrievals = (index - bufferedQueryResults.size()) / 50 + 1;
        System.out.println("lazy loading - # of retrievals: " + retrievals);
        for (int i=0; i<retrievals; i++) {
          System.out.println("downloading results " + bufferedQueryResults.size() + " to " + (bufferedQueryResults.size() + 50));
          nextFiftyResults(bufferedQueryResults.size());
        }
      }

      return bufferedQueryResults.get(index).uri;
    }
   
    /**
     * {@inheritDoc}
     * @see java.util.AbstractCollection#size()
     */
    public int size() {
      return this.size;
    }
   
    private int nextFiftyResults(int offset) {
        try {      
          // Fire GET request to Yahoo image search API
          String url = BASE_URL + "query=" + URLEncoder.encode(queryString, "UTF-8") + "&results=50&start=" + offset;      
        GetMethod yahooRequest = new GetMethod(url);
        yahooRequest.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, Integer.valueOf(TIMEOUT));
        httpClient.executeMethod(yahooRequest);
       
        // Create XML DOM
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder docBuilder = factory.newDocumentBuilder();
            Document dom = docBuilder.parse(yahooRequest.getResponseBodyAsStream());
           
        // Parse DOM
            return parseDom(dom);
        } catch (UnsupportedEncodingException e) {
          log.severe(e.getClass() + ": " + e.getMessage());
        } catch (IOException e) {
          log.severe(e.getClass() + ": " + e.getMessage());
        } catch (SAXException e) {
          log.severe(e.getClass() + ": " + e.getMessage());
        } catch (ParserConfigurationException e) {
          log.severe(e.getClass() + ": " + e.getMessage());
        }
        return 0;
    }
   
      private int parseDom(Document dom) {       
        // Create a node iterator
          Node root = dom.getDocumentElement();
      
          // Parse number of available results
          int totalResultsAvailable = 0;
          if (root.getNodeType() == Node.ELEMENT_NODE) {
            try {
              totalResultsAvailable = Integer.parseInt(((Element) root).getAttribute("totalResultsAvailable"));
            } catch (Exception e) {
              log.severe("Number of total results available not found");
            }
          }
         
          if (totalResultsAvailable == 0)
            return 0;
         
          NodeList results = root.getChildNodes();
         
          // Walk the DOM for all <result> elements
          Element aResult;
          NodeList resultFragment;
          Element aResultFragmentChild;

          for (int i=0; i<results.getLength(); i++) {
            if (results.item(i).getNodeType() == Node.ELEMENT_NODE) {
              aResult = (Element) results.item(i);
              if (aResult.getNodeName().equalsIgnoreCase("result")) {
                // Node is a <result> element node
                YahooResult result = new YahooResult();
               
                resultFragment = aResult.getChildNodes();
                for (int j=0; j<resultFragment.getLength(); j++) {
                  if (resultFragment.item(j).getNodeType() == Node.ELEMENT_NODE) {
                    aResultFragmentChild = (Element) resultFragment.item(j);
                   
                    // URL
                    if (aResultFragmentChild.getNodeName().equalsIgnoreCase("url")) {
                      try {
                        result.uri = new URI(aResultFragmentChild.getFirstChild().getNodeValue().trim());
                      } catch (URISyntaxException e) {
                        // Do nothing
                      }
                     
                    // Title
                    } else if (aResultFragmentChild.getNodeName().equalsIgnoreCase("title")) {
                      result.title = aResultFragmentChild.getFirstChild().getNodeValue().trim();
                     
                    // Filesize
                    } else if (aResultFragmentChild.getNodeName().equalsIgnoreCase("filesize")) {
                      try {
                        double size = Integer.parseInt(aResultFragmentChild.getFirstChild().getNodeValue().trim());
                        size = (int) (size / 10.24);
                        result.fileSize = (int) (size /= 100);
                      } catch (Exception e) {
                        // Do nothing
                      }
                     
                    // Fileformat
                    } else if (aResultFragmentChild.getNodeName().equalsIgnoreCase("fileformat")) {
                      result.format = aResultFragmentChild.getFirstChild().getNodeValue().trim();
                    }
                  }
                }
               
                if ((result.uri != null) && (result.title != null))
                  bufferedQueryResults.add(result);

              }
            }
          }
     
        return totalResultsAvailable;
      }
   
  }
 
  // A simple class to wrap a query result URL
  // with a little bit of metadata. (Might be
  // extended in thefuture?)
  @SuppressWarnings( "unused" )
  private class YahooResult {
    URI uri = null;       // URI
    String title = null// Title
        int fileSize;         // File size in kByte
    String format = null; // File format by extension
  }
 
  /*
  public static void main(String args[]) {
    System.out.println("starting...");
    YahooImageAPIDigitalObjectManagerImpl impl = new YahooImageAPIDigitalObjectManagerImpl();
    impl.setQuery(new QueryString("planets"));
    List<URI> results = impl.list(null);
    System.out.println(results.size() + " results");

    List<URI> sublist = results.subList(298, 310);
    for (URI uri : sublist)
      System.out.println(uri);
   
    System.out.println("done.");
  }
  */

}
 
TOP

Related Classes of eu.planets_project.ifr.core.storage.impl.web.YahooImageAPIDigitalObjectManagerImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.