// Source Code of com.crawljax.util.Helper

package com.crawljax.util;


import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.custommonkey.xmlunit.DetailedDiff;
import org.custommonkey.xmlunit.Diff;
import org.custommonkey.xmlunit.Difference;
import org.custommonkey.xmlunit.DifferenceListener;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.google.common.collect.Lists;


/**
 * Utility class that contains a number of helper functions used by Crawljax and some plugins.
 * 
 * @author mesbah
 * @version $Id: Helper.java 456 2010-11-05 15:04:57Z amesbah $
 */
public final class Helper {


  private static final int BASE_LENGTH = 3;


  private static final int TEXT_CUTOFF = 50;


  public static final Logger LOGGER = Logger.getLogger(Helper.class.getName());


  private Helper() {
  }


  /**
   * Internal used function to strip the basePath from a given url.
   * 
   * @param url
   *            the url to examine
   * @return the base path with file stipped
   */
  private static String getBasePath(URL url) {
    return url.getPath().replaceAll(url.getFile(), "");
  }


  /**
   * @param location
   *            Current location.
   * @param link
   *            Link to check.
   * @return Whether location and link are on the same domain.
   */
  public static boolean isLinkExternal(String location, String link) {


    if (!location.contains("://")) {
      // location must always contain :// by rule, it not link is handled as not external
      return false;
    }


    // This will jump out of the local file location
    if (location.startsWith("file") && link.startsWith("/")) {
      return true;
    }


    if (link.contains("://")) {
      if (location.startsWith("file") && link.startsWith("http") || link.startsWith("file")
              && location.startsWith("http")) {
        // Jump from file to http(s) or from http(s) to file, so external
        return true;
      }
      try {
        URL locationUrl = new URL(location);
        try {
          URL linkUrl = new URL(link);
          if (linkUrl.getHost().equals(locationUrl.getHost())) {
            String linkPath = getBasePath(linkUrl);
            return !(linkPath.startsWith(getBasePath(locationUrl)));
          }
          return true;
        } catch (MalformedURLException e) {
          LOGGER.info("Can not parse link " + link + " to check its externalOf "
                  + location);
          return false;
        }
      } catch (MalformedURLException e) {
        LOGGER.info("Can not parse location " + location + " to check if " + link
                + " isExternal", e);
        return false;
      }
    } else {
      // No full url specifier so internal link...
      return false;
    }
  }


  /**
   * @param url
   *            the URL string.
   * @return the base part of the URL.
   */
  public static String getBaseUrl(String url) {
    String head = url.substring(0, url.indexOf(":"));
    String subLoc = url.substring(head.length() + BASE_LENGTH);
    return head + "://" + subLoc.substring(0, subLoc.indexOf("/"));
  }


  /**
   * transforms a string into a Document object. TODO This needs more optimizations. As it seems
   * the getDocument is called way too much times causing a lot of parsing which is slow and not
   * necessary.
   * 
   * @param html
   *            the HTML string.
   * @return The DOM Document version of the HTML string.
   * @throws IOException
   *             if an IO failure occurs.
   * @throws SAXException
   *             if an exception occurs while parsing the HTML string.
   */
  public static Document getDocument(String html) throws SAXException, IOException {
    DOMParser domParser = new DOMParser();
    domParser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
    domParser.setFeature("http://xml.org/sax/features/namespaces", false);
    domParser.parse(new InputSource(new StringReader(html)));
    return domParser.getDocument();
  }


  /**
   * @param html
   *            the HTML string.
   * @return a Document object made from the HTML string.
   * @throws SAXException
   *             if an exception occurs while parsing the HTML string.
   * @throws IOException
   *             if an IO failure occurs.
   */
  public static Document getDocumentNoBalance(String html) throws SAXException, IOException {
    DOMParser domParser = new DOMParser();
    domParser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
    domParser.setFeature("http://cyberneko.org/html/features/balance-tags", false);
    domParser.parse(new InputSource(new StringReader(html)));
    return domParser.getDocument();
  }


  /**
   * @param element
   *            The DOM Element.
   * @return A string representation of all the element's attributes.
   */
  public static String getAllElementAttributes(Element element) {
    return getElementAttributes(element, new ArrayList<String>());
  }


  /**
   * @param element
   *            The DOM Element.
   * @param exclude
   *            the list of exclude strings.
   * @return A string representation of the element's attributes excluding exclude.
   */
  public static String getElementAttributes(Element element, List<String> exclude) {
    StringBuffer buffer = new StringBuffer();


    if (element != null) {
      NamedNodeMap attributes = element.getAttributes();
      if (attributes != null) {
        for (int i = 0; i < attributes.getLength(); i++) {
          Attr attr = (Attr) attributes.item(i);
          if (!exclude.contains(attr.getNodeName())) {
            buffer.append(attr.getNodeName() + "=");
            buffer.append(attr.getNodeValue() + " ");
          }
        }
      }
    }


    return buffer.toString().trim();
  }


  /**
   * @param element
   *            the element.
   * @return a string representation of the element including its attributes.
   */
  public static String getElementString(Element element) {
    if (element == null) {
      return "";
    }
    String text = Helper.removeNewLines(Helper.getTextValue(element)).trim();
    String info = "";
    if (!text.equals("")) {
      info += "\"" + text + "\" ";
      // Helper.removeNewLines(this.text.trim()) + " - ";
    }
    if (element != null) {
      if (element.hasAttribute("id")) {
        info += "ID: " + element.getAttribute("id") + " ";
      }
      info += Helper.getAllElementAttributes(element) + " ";
    }
    return info;
  }


  /**
   * @param dom
   *            the DOM document.
   * @param xpath
   *            the xpath.
   * @return The element found on DOM having the xpath position.
   * @throws XPathExpressionException
   *             if the xpath fails.
   */
  public static Element getElementByXpath(Document dom, String xpath)
          throws XPathExpressionException {
    XPath xp = XPathFactory.newInstance().newXPath();
    xp.setNamespaceContext(new HtmlNamespace());


    return (Element) xp.evaluate(xpath, dom, XPathConstants.NODE);
  }


  /**
   * Removes all the <SCRIPT/> tags from the document.
   * 
   * @param dom
   *            the document object.
   * @return the changed dom.
   */
  public static Document removeScriptTags(Document dom) {
    return removeTags(dom, "SCRIPT");
  }


  /**
   * Removes all the given tags from the document.
   * 
   * @param dom
   *            the document object.
   * @param tagName
   *            the tag name, examples: script, style, meta
   * @return the changed dom.
   */
  public static Document removeTags(Document dom, String tagName) {
    if (dom != null) {
      // NodeList list = dom.getElementsByTagName("SCRIPT");


      NodeList list;
      try {
        list = XPathHelper.evaluateXpathExpression(dom, "//" + tagName.toUpperCase());


        while (list.getLength() > 0) {
          Node sc = list.item(0);


          if (sc != null) {
            sc.getParentNode().removeChild(sc);
          }


          list = XPathHelper.evaluateXpathExpression(dom, "//" + tagName.toUpperCase());
          // list = dom.getElementsByTagName("SCRIPT");
        }
      } catch (XPathExpressionException e) {
        LOGGER.error(e.getMessage(), e);
      }


      return dom;
    }


    return null;
  }


  /**
   * Checks the existence of the directory. If it does not exist, the method creates it.
   * 
   * @param dir
   *            the directory to check.
   * @throws IOException
   *             if fails.
   */
  public static void directoryCheck(String dir) throws IOException {
    final File file = new File(dir);


    if (!file.exists()) {
      FileUtils.forceMkdir(file);
    }
  }


  /**
   * Checks whether the folder exists for fname, and creates it if neccessary.
   * 
   * @param fname
   *            folder name.
   * @throws IOException
   *             an IO exception.
   */
  public static void checkFolderForFile(String fname) throws IOException {


    if (fname.lastIndexOf(File.separator) > 0) {
      String folder = fname.substring(0, fname.lastIndexOf(File.separator));
      Helper.directoryCheck(folder);
    }
  }


  /**
   * Retrieve the var value for varName from a HTTP query string (format is
   * "var1=val1&var2=val2").
   * 
   * @param varName
   *            the name.
   * @param haystack
   *            the haystack.
   * @return variable value for varName
   */
  public static String getVarFromQueryString(String varName, String haystack) {
    if (haystack == null || haystack.length() == 0) {
      return null;
    }
    if (haystack.charAt(0) == '?') {
      haystack = haystack.substring(1);
    }
    String[] vars = haystack.split("&");


    for (String var : vars) {
      String[] tuple = var.split("=");
      if (tuple.length == 2 && tuple[0].equals(varName)) {
        return tuple[1];
      }
    }
    return null;
  }


  /**
   * @param dom
   *            the DOM document.
   * @return a string representation of the DOM.
   */
  public static String getDocumentToString(Document dom) {
    try {
      Source source = new DOMSource(dom);
      StringWriter stringWriter = new StringWriter();
      Result result = new StreamResult(stringWriter);
      TransformerFactory factory = TransformerFactory.newInstance();
      Transformer transformer = factory.newTransformer();
      transformer.setOutputProperty(OutputKeys.INDENT, "yes");
      transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
      transformer.setOutputProperty(OutputKeys.METHOD, "html");
      transformer.transform(source, result);
      return stringWriter.getBuffer().toString();
    } catch (TransformerConfigurationException e) {
      LOGGER.error(e.getMessage(), e);
    } catch (TransformerException e) {
      LOGGER.error(e.getMessage(), e);
    }
    return null;


  }


  /**
   * Serialize the Document object.
   * 
   * @param dom
   *            the document to serialize
   * @return the serialized dom String
   */
  public static byte[] getDocumentToByteArray(Document dom) {
    try {
      TransformerFactory tFactory = TransformerFactory.newInstance();


      Transformer transformer = tFactory.newTransformer();
      transformer.setOutputProperty(OutputKeys.INDENT, "yes");
      transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
      transformer.setOutputProperty(OutputKeys.METHOD, "html");
      // TODO should be fixed to read doctype declaration
      // transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC,
      // "-//W3C//DTD XHTML 1.0 Transitional//EN\"
      // \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd");
      transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC,
              "-//W3C//DTD XHTML 1.0 Strict//EN\" "
                      + "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");


      // transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC,
      // "-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd");
      DOMSource source = new DOMSource(dom);


      ByteArrayOutputStream out = new ByteArrayOutputStream();
      Result result = new StreamResult(out);
      transformer.transform(source, result);


      // System.out.println("Injected Javascript!");
      return out.toByteArray();
    } catch (TransformerConfigurationException e) {
      LOGGER.error(e.getMessage(), e);
    } catch (TransformerException e) {
      LOGGER.error(e.getMessage(), e);
    }
    return null;


  }


  /**
   * Save a string to a file and append a newline character to that string.
   * 
   * @param filename
   *            The filename to save to.
   * @param text
   *            The text to save.
   * @param append
   *            Whether to append to existing file.
   * @throws IOException
   *             On error.
   */
  public static void writeToFile(String filename, String text, boolean append)
          throws IOException {
    FileWriter fw = new FileWriter(filename, append);
    try {
      fw.write(text + "\n");
    } catch (IOException e) {
      throw e;
    } finally {
      fw.close();
    }
  }


  /**
   * @param code
   *            hashcode.
   * @return String version of hashcode.
   */
  public static String hashCodeToString(long code) {
    if (code < 0) {
      return "0" + (code * -1);
    } else {
      return "" + code;
    }
  }


  /**
   * Returns the text value of an element (title, alt or contents). Note that the result is 50
   * characters or less in length.
   * 
   * @param element
   *            The element.
   * @return The text value of the element.
   */
  public static String getTextValue(Element element) {
    String ret = "";
    if (element == null) {
      return "";
    }


    if (element.getTextContent() != null) {
      ret = element.getTextContent();
    } else if (element.hasAttribute("title")) {
      ret = element.getAttribute("title");
    } else if (element.hasAttribute("alt")) {
      ret = element.getAttribute("alt");
    }
    if (ret.length() > TEXT_CUTOFF) {
      return ret.substring(0, TEXT_CUTOFF);
    } else {
      return ret;
    }
  }


  /**
   * Get differences between doms.
   * 
   * @param controlDom
   *            The control dom.
   * @param testDom
   *            The test dom.
   * @return The differences.
   */
  public static List<Difference> getDifferences(String controlDom, String testDom) {
    return getDifferences(controlDom, testDom, Lists.<String> newArrayList());
  }


  /**
   * Get differences between doms.
   * 
   * @param controlDom
   *            The control dom.
   * @param testDom
   *            The test dom.
   * @param ignoreAttributes
   *            The list of attributes to ignore.
   * @return The differences.
   */
  @SuppressWarnings("unchecked")
  public static List<Difference> getDifferences(String controlDom, String testDom,
          final List<String> ignoreAttributes) {
    try {
      Diff d = new Diff(Helper.getDocument(controlDom), Helper.getDocument(testDom));
      DetailedDiff dd = new DetailedDiff(d);
      dd.overrideDifferenceListener(new DifferenceListener() {


        @Override
        public void skippedComparison(Node control, Node test) {
        }


        @Override
        public int differenceFound(Difference difference) {
          if (difference.getControlNodeDetail() == null
                  || difference.getControlNodeDetail().getNode() == null
                  || difference.getTestNodeDetail() == null
                  || difference.getTestNodeDetail().getNode() == null) {
            return RETURN_ACCEPT_DIFFERENCE;
          }
          if (ignoreAttributes.contains(difference.getTestNodeDetail().getNode()
                  .getNodeName())
                  || ignoreAttributes.contains(difference.getControlNodeDetail()
                          .getNode().getNodeName())) {
            return RETURN_IGNORE_DIFFERENCE_NODES_IDENTICAL;
          }
          return RETURN_ACCEPT_DIFFERENCE;
        }
      });


      return dd.getAllDifferences();
    } catch (Exception e) {
      LOGGER.error("Error with getDifferences: " + e.getMessage(), e);
    }
    return null;
  }


  /**
   * Removes newlines from a string.
   * 
   * @param html
   *            The string.
   * @return The new string without the newlines or tabs.
   */
  public static String removeNewLines(String html) {
    return html.replaceAll("[\\t\\n\\x0B\\f\\r]", "");
  }


  /**
   * @param string
   *            The original string.
   * @param regex
   *            The regular expression.
   * @param replace
   *            What to replace it with.
   * @return replaces regex in str by replace where the dot sign also supports newlines
   */
  public static String replaceString(String string, String regex, String replace) {
    Pattern p = Pattern.compile(regex, Pattern.DOTALL);
    Matcher m = p.matcher(string);
    String replaced = m.replaceAll(replace);
    p = Pattern.compile("  ", Pattern.DOTALL);
    m = p.matcher(replaced);
    return m.replaceAll(" ");
  }


  /**
   * Adds a slash to a path if it doesn't end with a slash.
   * 
   * @param folderName
   *            The path to append a possible slash.
   * @return The new, correct path.
   */
  public static String addFolderSlashIfNeeded(String folderName) {
    if (!folderName.equals("") && !folderName.endsWith("/")) {
      return folderName + "/";
    } else {
      return folderName;
    }
  }


  /**
   * Returns the filename in a path. For example with path = "foo/bar/crawljax.txt" returns
   * "crawljax.txt"
   * 
   * @param path
   * @return the filename from the path
   */
  private static String getFileNameInPath(String path) {
    String fname;
    if (path.indexOf("/") != -1) {
      fname = path.substring(path.lastIndexOf("/") + 1);
    } else {
      fname = path;
    }
    return fname;
  }


  /**
   * Retrieves the content of the filename. Also reads from JAR Searches for the resource in the
   * root folder in the jar
   * 
   * @param fname
   *            Filename.
   * @return The contents of the file.
   * @throws IOException
   *             On error.
   */
  public static String getTemplateAsString(String fname) throws IOException {
    // in .jar file
    String fnameJar = getFileNameInPath(fname);
    InputStream inStream = Helper.class.getResourceAsStream("/" + fnameJar);
    if (inStream == null) {
      // try to find file normally
      File f = new File(fname);
      if (f.exists()) {
        inStream = new FileInputStream(f);
      } else {
        throw new IOException("Cannot find " + fname + " or " + fnameJar);
      }
    }


    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inStream));
    String line;
    StringBuilder stringBuilder = new StringBuilder();


    while ((line = bufferedReader.readLine()) != null) {
      stringBuilder.append(line + "\n");
    }


    bufferedReader.close();
    return stringBuilder.toString();
  }


  /**
   * @param xpath
   *            The xpath of the element.
   * @return The JavaScript to get an element.
   */
  public static String getJSGetElement(String xpath) {
    String js =
            ""
                    + "function ATUSA_getElementInNodes(nodes, tagName, number){"
                    + "try{"
                    + "var pos = 1;"
                    + "for(i=0; i<nodes.length; i++){"
                    + "if(nodes[i]!=null && nodes[i].tagName!=null && "
                    + "nodes[i].tagName.toLowerCase() == tagName){"
                    + "if(number==pos){"
                    + "return nodes[i];"
                    + "}else{"
                    + "pos++;"
                    + "}"
                    + "}"
                    + "}"
                    + "}catch(e){}"
                    + "return null;"
                    + "}"
                    + "function ATUSA_getElementByXpath(xpath){"
                    + "try{"
                    + "var elements = xpath.toLowerCase().split('/');"
                    + "var curNode = window.document.body;"
                    + "var tagName, number;"
                    + "for(j=0; j<elements.length; j++){"
                    + "if(elements[j]!=''){"
                    + "if(elements[j].indexOf('[')==-1){"
                    + "tagName = elements[j];"
                    + "number = 1;"
                    + "}else{"
                    + "tagName = elements[j].substring(0, elements[j].indexOf('['));"
                    + "number = elements[j].substring(elements[j].indexOf('[')+1, "
                    + "elements[j].lastIndexOf(']'));"
                    + "}"
                    + "if(tagName!='body' && tagName!='html'){"
                    + "curNode = ATUSA_getElementInNodes(curNode.childNodes, tagName, number);"
                    + "if(curNode==null){" + "return null;" + "}" + "}" + "}" + "}"
                    + "}catch(e){return null;}" + "return curNode;" + "}"
                    + "try{var ATUSA_element = ATUSA_getElementByXpath('" + xpath
                    + "');}catch(e){return null;}";


    return js;
  }


  /**
   * @param frame
   *            the frame element.
   * @return the name or id of this element if they are present, otherwise null.
   */
  public static String getFrameIdentification(Element frame) {


    Attr attr = frame.getAttributeNode("id");
    if (attr != null && attr.getNodeValue() != null && !attr.getNodeValue().equals("")) {
      return attr.getNodeValue();
    }


    attr = frame.getAttributeNode("name");
    if (attr != null && attr.getNodeValue() != null && !attr.getNodeValue().equals("")) {
      return attr.getNodeValue();
    }


    return null;


  }


  /**
   * Write the document object to a file.
   * 
   * @param document
   *            the document object.
   * @param filePathname
   *            the path name of the file to be written to.
   * @param method
   *            the output method: for instance html, xml, text
   * @param indent
   *            amount of indentation. -1 to use the default.
   * @throws TransformerException
   *             if an exception occurs.
   * @throws IOException
   *             if an IO exception occurs.
   */
  public static void writeDocumentToFile(Document document, String filePathname, String method,
          int indent) throws TransformerException, IOException {


    checkFolderForFile(filePathname);
    Transformer transformer = TransformerFactory.newInstance().newTransformer();
    transformer.setOutputProperty(OutputKeys.INDENT, "yes");
    transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
    transformer.setOutputProperty(OutputKeys.METHOD, method);


    if (indent > -1) {
      transformer.setOutputProperty(
              org.apache.xml.serializer.OutputPropertiesFactory.S_KEY_INDENT_AMOUNT,
              Integer.toString(indent));
    }
    transformer.transform(new DOMSource(document), new StreamResult(new FileOutputStream(
            filePathname)));
  }


  /**
   * Returns the file contents without stripping line-endings.
   * 
   * @param file
   *            File to read out.
   * @return Contents including line-endings.
   */
  public static String getContent(File file) {
    StringBuilder contents = new StringBuilder();


    try {
      BufferedReader input = new BufferedReader(new FileReader(file));
      try {
        String line = null; // not declared within while loop
        while ((line = input.readLine()) != null) {
          contents.append(line);
          contents.append("\n");
        }
      } finally {
        input.close();
      }
    } catch (IOException e) {
      e.printStackTrace();
    }


    return contents.toString();
  }


}
// Source Code of com.crawljax.util.Helper
//
// Related Classes of com.crawljax.util.Helper