Package org.archive.wayback.replay

Source Code of org.archive.wayback.replay.TagMagixTest

/* TagMagixTest
*
* $Id: TagMagixTest.java 2239 2008-04-15 00:01:01Z bradtofel $
*
* Created on 6:36:07 PM Feb 14, 2006.
*
* Copyright (C) 2006 Internet Archive.
*
* This file is part of wayback.
*
* wayback is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* wayback is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with wayback; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package org.archive.wayback.replay;

import org.archive.wayback.replay.TagMagix;
import org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter;

import junit.framework.TestCase;

/**
*
*
* @author brad
* @version $Date: 2008-04-15 01:01:01 +0100 (Ter, 15 Abr 2008) $, $Revision: 2239 $
*/
public class TagMagixTest extends TestCase {
 
  // HTML fixture shared by testFindAttr() and testFindAttrWhere().
  // snipped and modified from http://www.sudaneseonline.com/ on 20070418...
  // note: leading space in description META content
  // note: added newlines in Content-Language META tag
  // note: no quotes around Author META content
  // note: the test1234 META content contains an embedded newline
  // note: the Content-Language META tag appears twice with different
  //       content values ("ar-eg" first, then "ar-sa")
 
  String thePage = "<html>\n" +
  "<head>\n" +
  "<meta http-equiv=\"Content-Language\" \n   content=\"ar-eg\">\n" +
  "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1256\">\n" +
  "<meta name=\"resource-type\" content=\"document\">\n" +
  "<meta name=\"classification\" content=\"News\">\n" +
  "<meta name=\"test1234\" content=\"one\ntwo\">\n" +
  "<meta name=\"description\" content=\" A voice of the Sudan people on the  Internet\">\n" +

  "<meta http-equiv=\"Content-Language\" \n   content=\"ar-sa\">\n" +
  "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1256\">\n" +
  "<META NAME=\"Author\" CONTENT=Bakri Abubakr http://bayanit.com/>\n" +
  "<META NAME=\"Author2\" CONTENT=\"Bakri Abubakr http://bayanit.com/\">\n" +
  "</head>\n" +
  "<body>foo</body>\n" +
  "</html>\n";

  /**
   * Tests the code that finds attribute values in tags
   */
  public void testFindAttr() {
   
    checkAttrValue(thePage,"meta","http-equiv","Content-Language");
  }
  /**
   *
   */
  public void testFindAttrWhere() {
    checkAttrWhereValue(thePage,"meta","content","http-equiv",
        "Content-Type","text/html; charset=windows-1256");

    checkAttrWhereValue(thePage,"meta","content","http-equiv",
        "Content-Language","ar-eg");

    checkAttrWhereValue(thePage,"meta","content","name",
        "classification","News");

    checkAttrWhereValue(thePage,"meta","content","name",
        "test1234","one\ntwo");
   
    checkAttrWhereValue(thePage,"meta","content","name",
        "ClAsSification","News");

    checkAttrWhereValue(thePage,"meta","content","name",
        "description"," A voice of the Sudan people on the  Internet");

    checkAttrWhereValue(thePage,"meta","content","name",
        "description-no-existo",null);

    checkAttrWhereValue(thePage,"meta","content","name",
        "author","Bakri");

    checkAttrWhereValue(thePage,"meta","content","name",
        "author2","Bakri Abubakr http://bayanit.com/");
  }
 
  public void testFindEndOfFirst() {
    findEndOf("<head>","head",6);
    findEndOf("<html><head><body>","head",12);
    findEndOf("<html><head goo=bar><body>","head",20);
    findEndOf("<html><head goo=bar><body>full","body",26);
    findEndOf("<html><head goo=bar><body >full","body",27);
    findEndOf("<html><head goo=bar><body >full","body",27);
    findEndOf("<html><head goo=bar><body yar=bam>full","body",34);
    findEndOf("<html><head goo=bar><body yar='bam'>full","body",36);
    findEndOf("<html><head goo=bar><body yar=\"bam\">full","body",36);
  }
 
  public void findEndOf(String page, String tag, int offset) {
    StringBuilder sb = new StringBuilder(page);
    int found = TagMagix.getEndOfFirstTag(sb,tag);
    assertEquals("FAILED find end of " +tag+ " in ("+page+")",offset,found);
  }
 
  /**
   * Test method for 'org.archive.wayback.archivalurl.TagMagix.markupTag(StringBuffer, String, String, String, String, String)'
   */
  public void testMarkupTag() {

 
    // simple simple -- no quotes at all
    checkMarkup(
        "<A HREF=http://goofy.com/>",
        "<A HREF=http://web.archive.org/wayback/2004/http://goofy.com/>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/");

    // same test with lower case
    checkMarkup(
        "<a href=http://goofy.com/>",
        "<a href=http://web.archive.org/wayback/2004/http://goofy.com/>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/");

    // with funky mixed case
    checkMarkup(
        "<a hREF=http://goofy.com/>",
        "<a hREF=http://web.archive.org/wayback/2004/http://goofy.com/>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/");

    // more funky mixed case, this time in the attribute to replace argument
    checkMarkup(
        "<a hREF=http://goofy.com/>",
        "<a hREF=http://web.archive.org/wayback/2004/http://goofy.com/>",
        "A","HREF","http://web.archive.org/wayback/","2004","http://www.archive.org/");

    // another funky case permutation, this time in the tagname to replace
    checkMarkup(
        "<a hREF=http://goofy.com/>",
        "<a hREF=http://web.archive.org/wayback/2004/http://goofy.com/>",
        "a","HREF","http://web.archive.org/wayback/","2004","http://www.archive.org/");

    // with double quotes
    checkMarkup(
        "<A HREF=\"http://goofy.com/\">",
        "<A HREF=\"http://web.archive.org/wayback/2004/http://goofy.com/\">",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/");

    // single quotes
    checkMarkup(
        "<A HREF='http://goofy.com/'>",
        "<A HREF='http://web.archive.org/wayback/2004/http://goofy.com/'>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/");

    // two tags
    checkMarkup(
        "<A HREF='http://goofy.com/'><A HREF='http://goofier.com/'>",
        "<A HREF='http://web.archive.org/wayback/2004/http://goofy.com/'><A HREF='http://web.archive.org/wayback/2004/http://goofier.com/'>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/");

    // two tags with newline:
    checkMarkup(
        "<A HREF='http://goofy.com/'>\n<A HREF='http://goofier.com/'>",
        "<A HREF='http://web.archive.org/wayback/2004/http://goofy.com/'>\n<A HREF='http://web.archive.org/wayback/2004/http://goofier.com/'>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/");

   
    // two tags in "page" but only asking to update one of them
    checkMarkup(
        "<A HREF='http://goofy.com/'><B HREF='http://goofier.com/'>",
        "<A HREF='http://web.archive.org/wayback/2004/http://goofy.com/'><B HREF='http://goofier.com/'>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/");
 
    // two tags, asking to update the other.
    checkMarkup(
        "<A HREF='http://goofy.com/'><B HREF='http://goofier.com/'>",
        "<A HREF='http://goofy.com/'><B HREF='http://web.archive.org/wayback/2004/http://goofier.com/'>",
        "B","href","http://web.archive.org/wayback/","2004","http://www.archive.org/");

    // simple path relative
    checkMarkup(
        "<A HREF='index.html'>",
        "<A HREF='http://web.archive.org/wayback/2004/http://www.archive.org/index.html'>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/");

    // simple server relative but irrelavant -- still at top level
    checkMarkup(
        "<A HREF='/index.html'>",
        "<A HREF='http://web.archive.org/wayback/2004/http://www.archive.org/index.html'>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/");

    // server relative but with non directory base url
    checkMarkup(
        "<A HREF='/index.html'>",
        "<A HREF='http://web.archive.org/wayback/2004/http://www.archive.org/index.html'>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/dir");

    // server relative being significant
    checkMarkup(
        "<A HREF='/index.html'>",
        "<A HREF='http://web.archive.org/wayback/2004/http://www.archive.org/index.html'>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/dir/");

    // path relative with non-directory base url
    checkMarkup(
        "<A HREF='index.html'>",
        "<A HREF='http://web.archive.org/wayback/2004/http://www.archive.org/index.html'>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/dir");

    // path relative in subdirectory
    checkMarkup(
        "<A HREF='index.html'>",
        "<A HREF='http://web.archive.org/wayback/2004/http://www.archive.org/dir/index.html'>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/dir/");

    // don't touch a "malformed" attribute (no closing apos)
    checkMarkup(
        "<A HREF='index.html>",
        "<A HREF='index.html>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/dir/");

    // don't touch a "malformed" attribute (no differing quotes around attribute.)
    checkMarkup(
        "<A HREF='index.html\">",
        "<A HREF='index.html\">",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/dir/");

    // same as last, but reversed: don't touch a "malformed" attribute (no differing quotes around attribute.)
    checkMarkup(
        "<A HREF=\"index.html'>",
        "<A HREF=\"index.html'>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/dir/");

    // newline in attribute
    checkMarkup(
        "<A HREF='/index.html'\n FOO='bar'>",
        "<A HREF='http://web.archive.org/wayback/2004/http://www.archive.org/index.html'\n FOO='bar'>",
        "A","href","http://web.archive.org/wayback/","2004","http://www.archive.org/dir/");

    // newlines in attribute
    checkMarkup(
        "<link rel=\"stylesheet\"\n goo=\"1\"\n href=\"/_style/style.css\">",
        "<link rel=\"stylesheet\"\n goo=\"1\"\n href=\"http://web.archive.org/wayback/2004/http://www.archive.org/_style/style.css\">",
        "link","href","http://web.archive.org/wayback/","2004","http://www.archive.org/dir/");
   
    // newlines in attribute, plus extra
    checkMarkup(
        "<b><link rel=\"stylesheet\"\n goo=\"1\"\n href=\"/_style/style.css\"></b>",
        "<b><link rel=\"stylesheet\"\n goo=\"1\"\n href=\"http://web.archive.org/wayback/2004/http://www.archive.org/_style/style.css\"></b>",
        "link","href","http://web.archive.org/wayback/","2004","http://www.archive.org/dir/");

    // newlines in attribute, plus extra, diff case
    checkMarkup(
        "<b><link rel=\"stylesheet\"\n goo=\"1\"\n href=\"/_style/style.css\"></b>",
        "<b><link rel=\"stylesheet\"\n goo=\"1\"\n href=\"http://web.archive.org/wayback/2004/http://www.archive.org/_style/style.css\"></b>",
        "LINK","HREF","http://web.archive.org/wayback/","2004","http://www.archive.org/dir/");

    // newlines in attribute, plus extra, diff case, no protocol
    checkMarkup(
        "<b><link rel=\"stylesheet\"\n goo=\"1\"\n href=\"/_style/style.css\"></b>",
        "<b><link rel=\"stylesheet\"\n goo=\"1\"\n href=\"http://web.archive.org/wayback/2004/http://archive.org/_style/style.css\"></b>",
        "LINK","HREF","http://web.archive.org/wayback/","2004","http://archive.org/dir/");
   
    // Javascript escaped quote attribute:
    checkMarkup(
         "document.write(\"<link rel=\\\"stylesheet\\\" type=\\\"text/css\\\" href=\\\"/css/print.css\\\" />\");",
         "document.write(\"<link rel=\\\"stylesheet\\\" type=\\\"text/css\\\" href=\\\"http://web.archive.org/wayback/2004/http://boogle.org/css/print.css\\\" />\");",
        "LINK","HREF","http://web.archive.org/wayback/","2004","http://boogle.org/dir/");
   
   
  }
 
  public void testCSSMarkup() {

    // basic, with quot apos + raw:
    checkCSSMarkup("@import url(http://foo.com/f.css);",
        "@import url(http://web.archive.org/wayback/2004/http://foo.com/f.css);",
        "http://web.archive.org/wayback/","2004","http://foo.com/");
    checkCSSMarkup("@import url('http://foo.com/f.css');",
        "@import url('http://web.archive.org/wayback/2004/http://foo.com/f.css');",
        "http://web.archive.org/wayback/","2004","http://foo.com/");
    checkCSSMarkup("@import url(\"http://foo.com/f.css\");",
        "@import url(\"http://web.archive.org/wayback/2004/http://foo.com/f.css\");",
        "http://web.archive.org/wayback/","2004","http://foo.com/");


    // same as basic, but with extra whitespace after "url"
    checkCSSMarkup("@import url (http://foo.com/f.css);",
        "@import url (http://web.archive.org/wayback/2004/http://foo.com/f.css);",
        "http://web.archive.org/wayback/","2004","http://foo.com/");
    checkCSSMarkup("@import url\t('http://foo.com/f.css');",
        "@import url\t('http://web.archive.org/wayback/2004/http://foo.com/f.css');",
        "http://web.archive.org/wayback/","2004","http://foo.com/");
    checkCSSMarkup("@import url\n(\"http://foo.com/f.css\");",
        "@import url\n(\"http://web.archive.org/wayback/2004/http://foo.com/f.css\");",
        "http://web.archive.org/wayback/","2004","http://foo.com/");

    // whitespace within url spec:
    checkCSSMarkup("@import url( http://foo.com/f.css);",
        "@import url( http://web.archive.org/wayback/2004/http://foo.com/f.css);",
        "http://web.archive.org/wayback/","2004","http://foo.com/");
    checkCSSMarkup("@import url('http://foo.com/f.css' );",
        "@import url('http://web.archive.org/wayback/2004/http://foo.com/f.css' );",
        "http://web.archive.org/wayback/","2004","http://foo.com/");
    checkCSSMarkup("@import url( \"http://foo.com/f.css\" );",
        "@import url( \"http://web.archive.org/wayback/2004/http://foo.com/f.css\" );",
        "http://web.archive.org/wayback/","2004","http://foo.com/");
    checkCSSMarkup("@import url(\t\"http://foo.com/f.css\"\t);",
        "@import url(\t\"http://web.archive.org/wayback/2004/http://foo.com/f.css\"\t);",
        "http://web.archive.org/wayback/","2004","http://foo.com/");
    checkCSSMarkup("@import url(\n\"http://foo.com/f.css\"\n);",
        "@import url(\n\"http://web.archive.org/wayback/2004/http://foo.com/f.css\"\n);",
        "http://web.archive.org/wayback/","2004","http://foo.com/");
    checkCSSMarkup("@import url(\r\n\"http://foo.com/f.css\"\n\r);",
        "@import url(\r\n\"http://web.archive.org/wayback/2004/http://foo.com/f.css\"\n\r);",
        "http://web.archive.org/wayback/","2004","http://foo.com/");

    checkCSSMarkup("@import \"http://foo.com/f.css\";",
        "@import \"http://web.archive.org/wayback/2004/http://foo.com/f.css\";",
        "http://web.archive.org/wayback/","2004","http://foo.com/");
    checkCSSMarkup("@import 'http://foo.com/f.css';",
        "@import 'http://web.archive.org/wayback/2004/http://foo.com/f.css';",
        "http://web.archive.org/wayback/","2004","http://foo.com/");
   
  }
 
  public void testStyleUrlMarkup() {
    // simple, server relative
    checkStyleUrlMarkup("<table style=\"background: url(/css/b.gif)\"></table>",
        "<table style=\"background: url(http://w.a.org/wb/2004/http://f.au/css/b.gif)\"></table>",
        "http://w.a.org/wb/","2004","http://f.au/");
    // server-relative, which now means something
    checkStyleUrlMarkup("<table style=\"background: url(/css/b.gif)\"></table>",
        "<table style=\"background: url(http://w.a.org/wb/2004/http://f.au/css/b.gif)\"></table>",
        "http://w.a.org/wb/","2004","http://f.au/b/");

    // path relative:
    checkStyleUrlMarkup("<table style=\"background: url(css/b.gif)\"></table>",
        "<table style=\"background: url(http://w.a.org/wb/2004/http://f.au/css/b.gif)\"></table>",
        "http://w.a.org/wb/","2004","http://f.au/");
    // path relative, meaningful:
    checkStyleUrlMarkup("<table style=\"background: url(css/b.gif)\"></table>",
        "<table style=\"background: url(http://w.a.org/wb/2004/http://f.au/b/css/b.gif)\"></table>",
        "http://w.a.org/wb/","2004","http://f.au/b/");

    // absolute:
    checkStyleUrlMarkup("<table style=\"background: url(http://e.au/css/b.gif)\"></table>",
        "<table style=\"background: url(http://w.a.org/wb/2004/http://e.au/css/b.gif)\"></table>",
        "http://w.a.org/wb/","2004","http://f.au/b/");

    // apos attribute
    checkStyleUrlMarkup("<table style='background: url(/css/b.gif)'></table>",
        "<table style='background: url(http://w.a.org/wb/2004/http://f.au/css/b.gif)'></table>",
        "http://w.a.org/wb/","2004","http://f.au/");
   
    // quote attribute, apos url:
    checkStyleUrlMarkup("<table style=\"background: url('/css/b.gif')\"></table>",
        "<table style=\"background: url('http://w.a.org/wb/2004/http://f.au/css/b.gif')\"></table>",
        "http://w.a.org/wb/","2004","http://f.au/");

    // apos attribute, quote url:
    checkStyleUrlMarkup("<table style='background: url(\"/css/b.gif\")'></table>",
        "<table style='background: url(\"http://w.a.org/wb/2004/http://f.au/css/b.gif\")'></table>",
        "http://w.a.org/wb/","2004","http://f.au/");

    // apos attribute, quote url, plus semi-colon:
    checkStyleUrlMarkup("<table style='background: url(\"/css/b.gif\");'></table>",
        "<table style='background: url(\"http://w.a.org/wb/2004/http://f.au/css/b.gif\");'></table>",
        "http://w.a.org/wb/","2004","http://f.au/");

    // Two url()s in same attribute value:
    checkStyleUrlMarkup("<table style=\"bg: url(/css/b.gif); fg: url(/css/f.gif);\"></table>",
        "<table style=\"bg: url(http://w.a.org/wb/2004/http://f.au/css/b.gif); fg: url(http://w.a.org/wb/2004/http://f.au/css/f.gif);\"></table>",
        "http://w.a.org/wb/","2004","http://f.au/");

    // Two url()s in same quoted attribute value, with embedded apos:
    checkStyleUrlMarkup("<table style=\"bg: url('/css/b.gif'); fg: url('/css/f.gif');\"></table>",
        "<table style=\"bg: url('http://w.a.org/wb/2004/http://f.au/css/b.gif'); fg: url('http://w.a.org/wb/2004/http://f.au/css/f.gif');\"></table>",
        "http://w.a.org/wb/","2004","http://f.au/");

    // Two url()s in same apos'ed attribute value, with embedded quote:
    checkStyleUrlMarkup("<table style='bg: url(\"/css/b.gif\"); fg: url(\"/css/f.gif\");'></table>",
        "<table style='bg: url(\"http://w.a.org/wb/2004/http://f.au/css/b.gif\"); fg: url(\"http://w.a.org/wb/2004/http://f.au/css/f.gif\");'></table>",
        "http://w.a.org/wb/","2004","http://f.au/");
//
//    NOT WORKING YET... Let's see if we get a complaint... Not even sure this
//    is legit HTML:
//   
//    // Two url()s in same quoted attribute value, with embedded escaped quote:
//    checkStyleUrlMarkup("<table style=\"bg: url(\\\"/css/b.gif\\\"); fg: url(\\\"/css/f.gif\\\");\"></table>",
//        "<table style=\"bg: url(\\\"http://w.a.org/wb/2004/http://f.au/css/b.gif\\\"); fg: url(\\\"http://w.a.org/wb/2004/http://f.au/css/f.gif\\\");\"></table>",
//        "http://w.a.org/wb/","2004","http://f.au/");
   
 
  }
 
 
  private void checkAttrValue(String page, String tag, String attr,
      String wantValue) {
    StringBuilder sb = new StringBuilder(page);
    String foundValue = TagMagix.getTagAttr(sb, tag, attr);
    assertEquals(foundValue,wantValue);
  }
  private void checkAttrWhereValue(String page, String tag, String attr,
      String whereAttr, String whereVal, String wantValue) {
    StringBuilder sb = new StringBuilder(page);
    String foundValue = TagMagix.getTagAttrWhere(sb, tag, attr, whereAttr,whereVal);
    if(foundValue != null) {
      assertEquals(foundValue,wantValue);
    } else {
      assertNull(wantValue);
    }
  }
 
  private void checkCSSMarkup(String orig, String want,String prefix, String ts, String url) {
    StringBuilder buf = new StringBuilder(orig);
    ArchivalUrlResultURIConverter uriC = new ArchivalUrlResultURIConverter();
    uriC.setReplayURIPrefix(prefix);
    TagMagix.markupCSSImports(buf, uriC, ts, url);
    String marked = buf.toString();
    assertEquals(want,marked);
  }
 
  private void checkStyleUrlMarkup(String orig, String want, String prefix, String ts, String url) {
    StringBuilder buf = new StringBuilder(orig);
    ArchivalUrlResultURIConverter uriC = new ArchivalUrlResultURIConverter();
    uriC.setReplayURIPrefix(prefix);
    TagMagix.markupStyleUrls(buf,uriC,ts,url);
    String marked = buf.toString();
    assertEquals(want,marked);
  }
 
  private void checkMarkup(String orig, String want, String tag, String attr, String prefix, String ts, String url) {
    StringBuilder buf = new StringBuilder(orig);
    ArchivalUrlResultURIConverter uriC = new ArchivalUrlResultURIConverter();
    uriC.setReplayURIPrefix(prefix);
    TagMagix.markupTagREURIC(buf,uriC,ts,url,tag,attr);
    String marked = buf.toString();
    assertEquals(want,marked);
  }
}
TOP

Related Classes of org.archive.wayback.replay.TagMagixTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.