Examples of UURI

org.archive.net.UURI
Usable URI. The bulk of the functionality of this class has moved to {@link UsableURI} in the archive-commons project. This class adds Kryo serialization.

Examples of org.archive.net.UURI

  public void setRequestUrl(String urlStr) throws URIException {
      if (!urlStr.startsWith("http://")) {
          urlStr = "http://" + urlStr;
      }
        // If its not http, next line throws exception. TODO: Fix.
      UURI requestURI = UURIFactory.getInstance(urlStr);
      put(WaybackConstants.REQUEST_URL_CLEANED, requestURI.toString());
        put(WaybackConstants.REQUEST_URL, urlStr);
  }

View Full Code Here

Examples of org.archive.net.UURI

  public static String resolveUrl(String baseUrl, String url) {
    // TODO: this only works for http://
    if(url.startsWith("http://")) {
      return url;
    }
    UURI absBaseURI;
    UURI resolvedURI = null;
    try {
      absBaseURI = UURIFactory.getInstance(baseUrl);
      resolvedURI = UURIFactory.getInstance(absBaseURI, url);
    } catch (URIException e) {
      e.printStackTrace();
      return url;
    }
    return resolvedURI.getEscapedURI();
  }

View Full Code Here

Examples of org.archive.net.UURI

        exactHostFlag.equals(WaybackConstants.REQUEST_YES)) {


      String searchUrl = wbRequest.get(WaybackConstants.REQUEST_URL);
      try {


        UURI searchURI = UURIFactory.getInstance(searchUrl);
        String exactHost = searchURI.getHost();
        filter = new HostMatchFilter(exactHost);


      } catch (URIException e) {
        // Really, this isn't gonna happen, we've already canonicalized
        // it... should really optimize and do that just once.

View Full Code Here

Examples of org.archive.net.UURI

    // was the only easy way I could find to get the correct unescaping
    // out of UURIs, possibly a bug. Definitely needs some TLC in any case,
    // as building UURIs is *not* a cheap operation.
    
    // unescape anything that can be:
    UURI tmpURI = UURIFactory.getInstance(searchUrl);
    tmpURI.setPath(tmpURI.getPath());
    
    // convert to UURI to perform required URI fixup:
    UURI searchURI = UURIFactory.getInstance(tmpURI.getURI());
    
    // replace ' ' with '+' (this is only to match Alexa's canonicalization)
    String newPath = searchURI.getEscapedPath().replace("%20","+");
    
    // replace multiple consecutive '/'s in the path.
    while(newPath.contains("//")) {
      newPath = newPath.replace("//","/");
    }
    
    // this would remove a trailing '/' character, unless the path is empty
    // but we're not going to do this just yet..
//    if((newPath.length() > 1) && newPath.endsWith("/")) {
//      newPath = newPath.substring(0,newPath.length()-1);
//    }
//    searchURI.setEscapedPath(newPath);
//    searchURI.setRawPath(newPath.toCharArray());
//    String query = searchURI.getEscapedQuery();
    
    // TODO: handle non HTTP port stripping, too.
//    String portStr = "";
//    if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
//      portStr = ":" + searchURI.getPort();
//    }
//    return searchURI.getHostBasename() + portStr + 
//    searchURI.getEscapedPathQuery();
    
    StringBuilder sb = new StringBuilder(searchUrl.length());
    sb.append(searchURI.getHostBasename());
    if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
      sb.append(":").append(searchURI.getPort());
    }
    sb.append(newPath);
    if(searchURI.getEscapedQuery() != null) {
      sb.append("?").append(searchURI.getEscapedQuery());
    }


    return sb.toString();
  }

View Full Code Here

Examples of org.archive.net.UURI

    String arcFileName = tokens[8];


    String origUrl = url;
    if(!url.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
      try {
        UURI uri = UURIFactory.getInstance(
            WaybackConstants.HTTP_URL_PREFIX + url);
        if(uri.getPort() != -1) {
          origHost += ":" + uri.getPort();
        }
        origUrl = origHost + uri.getEscapedPathQuery();
      } catch (URIException e) {
        // TODO Stifle? throw an error?
        e.printStackTrace();
        return null;
      }

View Full Code Here

Examples of org.archive.net.UURI


    result.put(WaybackConstants.RESULT_URL, urlStr);
    result.put(WaybackConstants.RESULT_URL_KEY, urlStr);


  
    UURI uri = UURIFactory.getInstance(urlStr);
    String uriHost = uri.getHost();
    if (uriHost == null) {


      LOGGER.info("No host in " + urlStr);


    } else {

View Full Code Here

Examples of org.archive.net.UURI

        transformWarcFilename(header.getReaderIdentifier()));
    result.put(WaybackConstants.RESULT_OFFSET, 
        String.valueOf(header.getOffset()));
    
    String origUrl = header.getUrl();
    UURI uri = addUrlDataToSearchResult(result,origUrl);


    // need to parse the document's HTTP message and headers here: WARCReader
    // does not implement this... yet..
    
        byte [] statusBytes = HttpParser.readRawLine(rec);
        int eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            throw new RecoverableIOException("Failed to read http status where one " +
                " was expected: " + new String(statusBytes));
        }
        String statusLine = EncodingUtil.getString(statusBytes, 0,
            statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
        if ((statusLine == null) ||
                !StatusLine.startsWithHTTP(statusLine)) {
           throw new RecoverableIOException("Failed parse of http status line.");
        }
        StatusLine status = new StatusLine(statusLine);
    result.put(WaybackConstants.RESULT_HTTP_CODE, 
        String.valueOf(status.getStatusCode()));
        
    Header[] headers = HttpParser.parseHeaders(rec,
                ARCConstants.DEFAULT_ENCODING);


    rec.close();
    result.put(WaybackConstants.RESULT_MD5_DIGEST, 
        transformDigest(header.getHeaderValue(
            WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));


    if (headers != null) {
  
      for (Header httpHeader : headers) {
        if (httpHeader.getName().equals(
            WaybackConstants.LOCATION_HTTP_HEADER)) {
  
          String locationStr = httpHeader.getValue();
          // TODO: "Location" is supposed to be absolute:
          // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
          // (section 14.30) but Content-Location can be
          // relative.
          // is it correct to resolve a relative Location, as
          // we are?
          // it's also possible to have both in the HTTP
          // headers...
          // should we prefer one over the other?
          // right now, we're ignoring "Content-Location"
          try {
            UURI uriRedirect = UURIFactory.getInstance(uri,
                locationStr);
            result.put(WaybackConstants.RESULT_REDIRECT_URL,
                uriRedirect.getEscapedURI());
          } catch (URIException e) {
            LOGGER.info("Bad Location: " + locationStr
                + " for " + origUrl + " in "
                + header.getReaderIdentifier() + " Skipped");
          }

View Full Code Here

Examples of org.archive.net.UURI

      result.put(WaybackConstants.RESULT_URL, uriStr);
      result.put(WaybackConstants.RESULT_URL_KEY, uriStr);
    
    } else {
    
      UURI uri = UURIFactory.getInstance(uriStr);
      result.put(WaybackConstants.RESULT_URL, uriStr);
    
      String uriHost = uri.getHost();
      if (uriHost == null) {
        LOGGER.info("No host in " + uriStr + " in " + meta.getArc());
      } else {
        result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost);
    
        String statusCode = (meta.getStatusCode() == null) ? "-" : meta
            .getStatusCode();
        result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode);
    
        String redirectUrl = "-";
        Header[] headers = rec.getHttpHeaders();
        if (headers != null) {
    
          for (int i = 0; i < headers.length; i++) {
            if (headers[i].getName().equals(
                WaybackConstants.LOCATION_HTTP_HEADER)) {


              String locationStr = headers[i].getValue();
              // TODO: "Location" is supposed to be absolute:
              // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
              // (section 14.30) but Content-Location can be
              // relative.
              // is it correct to resolve a relative Location, as
              // we are?
              // it's also possible to have both in the HTTP
              // headers...
              // should we prefer one over the other?
              // right now, we're ignoring "Content-Location"
              try {
                UURI uriRedirect = UURIFactory.getInstance(uri,
                    locationStr);
                redirectUrl = uriRedirect.getEscapedURI();
    
              } catch (URIException e) {
                LOGGER.info("Bad Location: " + locationStr
                    + " for " + uriStr + " in "
                    + meta.getArc() + " Skipped");

View Full Code Here

Examples of org.archive.net.UURI

  
  private static String getKey(String url, boolean prefix)
  throws URIException {


    String key = ArchiveUtils.addImpliedHttpIfNecessary(url);
    UURI uuri = UURIFactory.getInstance(key);
    key = uuri.getScheme() + "://" + uuri.getAuthority() + 
      uuri.getEscapedPathQuery();


    key = SURT.fromURI(key);
    
    int hashPos = key.indexOf('#');
    if(hashPos != -1) {

View Full Code Here

Examples of org.archive.net.UURI

        testHostServer(servers, "dns://www.example.com:9090");
    }
    
    private void testHostServer(DefaultServerCache servers, String uri)
    throws URIException {
        UURI uuri = UURIFactory.getInstance(uri);
        servers.getServerFor(uuri);
        servers.getHostFor(uuri);
        assertTrue("cache lost server",
            servers.containsServer(CrawlServer.getServerKey(uuri)));
        assertTrue("cache lost host",
            servers.containsHost(uuri.getHost()));
    }

View Full Code Here

0 1 2 3 4 5

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.