Package org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser

Examples of org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser.Parser


public class TestHtmlParser extends LuceneTestCase {

  public void testUnicode() throws Exception {
    String text = "<html><body>汉语</body></html>";
    Parser parser = new Parser(new StringReader(text));
    assertEquals("汉语", parser.body);
  }
View Full Code Here


    assertEquals("汉语", parser.body);
  }
 
  public void testEntities() throws Exception {
    String text = "<html><body>&#x6C49;&#x8BED;&yen;</body></html>";
    Parser parser = new Parser(new StringReader(text));
    assertEquals("汉语¥", parser.body);
  }
View Full Code Here

    assertEquals("汉语¥", parser.body);
  }
 
  public void testComments() throws Exception {
    String text = "<html><body>foo<!-- bar --><! baz --></body></html>";
    Parser parser = new Parser(new StringReader(text));
    assertEquals("foo", parser.body);
  }
View Full Code Here

  }
 
  public void testScript() throws Exception {
    String text = "<html><body><script type=\"text/javascript\">" +
                  "document.write(\"test\")</script>foo</body></html>";
    Parser parser = new Parser(new StringReader(text));
    assertEquals("foo", parser.body);
  }
View Full Code Here

 
  public void testStyle() throws Exception {
    String text = "<html><head><style type=\"text/css\">" +
                  "body{background-color:blue;}</style>" +
                  "</head><body>foo</body></html>";
    Parser parser = new Parser(new StringReader(text));
    assertEquals("foo", parser.body);
  }
View Full Code Here

  public void testDoctype() throws Exception {
    String text = "<!DOCTYPE HTML PUBLIC " +
    "\"-//W3C//DTD HTML 4.01 Transitional//EN\"" +
    "\"http://www.w3.org/TR/html4/loose.dtd\">" +
    "<html><body>foo</body></html>";
    Parser parser = new Parser(new StringReader(text));
    assertEquals("foo", parser.body);
  }
View Full Code Here

    "<meta name=\"a\" content=\"1\" />" +
    "<meta name=\"b\" content=\"2\" />" +
    "<meta name=\"keywords\" content=\"this is a test\" />" +
    "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\" />" +
    "</head><body>foobar</body></html>";
    Parser parser = new Parser(new StringReader(text));
    Properties tags = parser.metaTags;
    assertEquals(4, tags.size());
    assertEquals("1", tags.get("a"));
    assertEquals("2", tags.get("b"));
    assertEquals("this is a test", tags.get("keywords"));
View Full Code Here

    assertEquals("text/html;charset=UTF-8", tags.get("content-type"));
  }
 
  public void testTitle() throws Exception {
    String text = "<html><head><TITLE>foo</TITLE><head><body>bar</body></html>";
    Parser parser = new Parser(new StringReader(text));
    assertEquals("foo", parser.title);
  }
View Full Code Here

    try {
      Locale.setDefault(new Locale("tr", "TR"));
      String text = "<html><HEAD><TITLE>ııı</TITLE></head><body>" +
      "<IMG SRC=\"../images/head.jpg\" WIDTH=570 HEIGHT=47 BORDER=0 ALT=\"ş\">" +
      "<a title=\"(ııı)\"></body></html>";
      Parser parser = new Parser(new StringReader(text));
      assertEquals("ııı", parser.title);
      assertEquals("[ş]", parser.body);
    } finally {
      Locale.setDefault(saved);
    }
View Full Code Here

        "<body>\r\n" +
        "TEST-000 text\r\n" +
        "\r\n" +
        "</body>\r\n" +
        "\r\n";
    Parser parser = new Parser(new StringReader(text));
    assertEquals("TEST-000 title", parser.title);
    assertEquals("TEST-000 text", parser.body.trim());
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser.Parser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.