Package com.subgraph.vega.internal.analysis.urls

Source Code of com.subgraph.vega.internal.analysis.urls.HtmlUrlExtractor

/*******************************************************************************
* Copyright (c) 2011 Subgraph.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
*     Subgraph - initial API and implementation
******************************************************************************/
package com.subgraph.vega.internal.analysis.urls;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.utils.URIUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.subgraph.vega.api.html.IHTMLParseResult;
import com.subgraph.vega.api.http.requests.IHttpResponse;
import com.subgraph.vega.api.util.VegaURI;

public class HtmlUrlExtractor {
 
  List<VegaURI> findHtmlUrls(IHttpResponse response) {
    final IHTMLParseResult htmlParseResult = response.getParsedHTML();
   
    if(htmlParseResult != null) {
      return extractUrlsFromDocument(htmlParseResult.getJsoupDocument());
    } else {
      return Collections.emptyList();
    }
  }
 
  List<VegaURI> findHtmlUrls(HttpEntity entity, URI basePath) throws IOException {
    final String htmlString = inputStreamToString(entity.getContent());
    final Document document = Jsoup.parse(htmlString, basePath.toString());
    return extractUrlsFromDocument(document);
  }
 
  private List<VegaURI> extractUrlsFromDocument(Document document) {
    final ArrayList<VegaURI> uris = new ArrayList<VegaURI>();
    uris.addAll(extractURIs(document, "a[href]", "abs:href"));
    uris.addAll(extractURIs(document, "[src]", "abs:src"));
    uris.addAll(extractURIs(document, "link[href]", "abs:href"));
    return uris;
  }
 
  private String inputStreamToString(InputStream in) throws IOException {
    final Reader r = new InputStreamReader(in, "UTF-8");
    final StringWriter w = new StringWriter();
    final char[] buffer = new char[8192];
    while(true) {
      int n = r.read(buffer, 0, buffer.length);
      if(n <= 0)
        return w.toString();
      w.write(buffer, 0, n);
    }
  }
  private List<VegaURI> extractURIs(Document document, String query, String attribute) {
    final ArrayList<VegaURI> uris = new ArrayList<VegaURI>();
    for(Element e: document.select(query)) {
      String link = e.attr(attribute);
      link = link.replace("\\", "%5C");
      URI uri = createURI(link);
      if(uri != null && hasValidHttpScheme(uri)) {
        final HttpHost targetHost = URIUtils.extractHost(uri);
        if(validateHost(targetHost)) {
          final VegaURI vegaURI = new VegaURI(targetHost, uri.getPath(), uri.getQuery());
          uris.add(vegaURI);
        }
      }
    }
    return uris;
  }
 
  private boolean validateHost(HttpHost host) {
    if(host.getHostName() == null || host.getHostName().isEmpty()) {
      return false;
    }
    try {
      new URI(host.getSchemeName(), null, host.getHostName(), host.getPort(), null, null, null);
      return true;
    } catch (URISyntaxException e) {
      return false;
    }
  }

  private boolean hasValidHttpScheme(URI uri) {
    final String scheme = uri.getScheme();
    return (scheme != null && (scheme.equalsIgnoreCase("http") || scheme.equalsIgnoreCase("https")));
  }
  private URI createURI(String link) {
    try {
      if(link.isEmpty())
        return null;
      return new URI(link);
    } catch (URISyntaxException ex) {
      return null;
    }
  }
}
TOP

Related Classes of com.subgraph.vega.internal.analysis.urls.HtmlUrlExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.