Examples of org.htmlparser.Parser

org.htmlparser.Parser

ahoo.com",new DefaultHTMLParserFeedback()); // In this example, we are registering all the common scanners parser.registerScanners(); for (NodeIterator i = parser.elements();e.hasMoreNodes();) { Node node = i.nextNode(); node.print(); } Below is some sample code to parse Yahoo.com and print only the text information. This scanning will run faster, as there are no scanners registered here.

 Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); // In this example, none of the scanners need to be registered // as a string node is not a tag to be scanned for. for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); if (node instanceof StringNode) {        	 StringNode stringNode = (StringNode)node;         System.out.println(stringNode.getText());     }  }

The above snippet will print out only the text contents in the html document.
Here's another snippet that will only print out the link urls in a document. This is an example of adding a link scanner.

 Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); parser.addScanner(new LinkScanner("-l")); for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode();     if (node instanceof LinkTag) { LinkTag linkTag = (LinkTag)node;         System.out.println(linkTag.getLink());     }  }

@see Parser#elements()

                            || linkTag.getLink().toUpperCase().indexOf("ORG")
                                != -1)
                        {
                            if (crawlDepth > 0)
                            {
                                Parser newParser =
                                    new Parser(
                                        linkTag.getLink(),
                                        new DefaultParserFeedback());
                                newParser.registerScanners();
                                System.out.print(
                                    "Crawling to " + linkTag.getLink());
                                crawl(newParser, crawlDepth - 1);
                            }
                            else

View Full Code Here

    /**
     * This testcase needs you to be online.
     */
    public void testElementsFromWeb() throws Exception
    {
        Parser parser;
        try
        {
            parser = new Parser("http://www.google.com");
        }
        catch (Exception e)
        {
            throw new ParserException(
                "You must be offline! This test needs you to be connected to the internet.",
                e);
        }
        parser.getReader().mark(5000);


        Node[] node = new Node[500];
        int i = 0;
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
        {
            node[i++] = e.nextNode();
        }
        int cnt = i;
        parser.getReader().reset();
        // Now try getting the elements again
        i = 0;
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
        {
            node[i++] = e.nextNode();
        }
        assertEquals(
            "There should be "

View Full Code Here

     */
    public MailRipper(String resourceLocation)
    {
        try
        {
            parser = new Parser(resourceLocation, new DefaultParserFeedback());
            parser.registerScanners();
        }
        catch (ParserException e)
        {
            System.err.println("Could not create parser object");

View Full Code Here

        final String city = "Ottawa";
        final String province = "ON";
        // the correct answer
        final String postal_code = "K2B 7V4";


        Parser parser;
        URL url;
        HttpURLConnection connection;
        StringBuffer buffer;
        PrintWriter out;
        boolean pass;
        NodeIterator enumeration;
        Node node;
        StringNode string;


        try
        {
            url =
                new URL("http://www.canadapost.ca/tools/pcl/bin/cp_search_response-e.asp");
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("POST");
            connection.setRequestProperty(
                "Referer",
                "http://www.canadapost.ca/tools/pcl/bin/default-e.asp");
            connection.setDoOutput(true);
            connection.setDoInput(true);
            connection.setUseCaches(false);
            buffer = new StringBuffer(1024);
            buffer.append("app_language=");
            buffer.append("english");
            buffer.append("&");
            buffer.append("app_response_start_row_number=");
            buffer.append("1");
            buffer.append("&");
            buffer.append("app_response_rows_max=");
            buffer.append("9");
            buffer.append("&");
            buffer.append("app_source=");
            buffer.append("quick");
            buffer.append("&");
            buffer.append("query_source=");
            buffer.append("q");
            buffer.append("&");
            buffer.append("name=");
            buffer.append("&");
            buffer.append("postal_code=");
            buffer.append("&");
            buffer.append("directory_area_name=");
            buffer.append("&");
            buffer.append("delivery_mode=");
            buffer.append("&");
            buffer.append("Suffix=");
            buffer.append("&");
            buffer.append("street_direction=");
            buffer.append("&");
            buffer.append("installation_type=");
            buffer.append("&");
            buffer.append("delivery_number=");
            buffer.append("&");
            buffer.append("installation_name=");
            buffer.append("&");
            buffer.append("unit_numbere=");
            buffer.append("&");
            buffer.append("app_state=");
            buffer.append("production");
            buffer.append("&");
            buffer.append("street_number=");
            buffer.append(number);
            buffer.append("&");
            buffer.append("street_name=");
            buffer.append(street);
            buffer.append("&");
            buffer.append("street_type=");
            buffer.append(type);
            buffer.append("&");
            buffer.append("test=");
            buffer.append("&");
            buffer.append("city=");
            buffer.append(city);
            buffer.append("&");
            buffer.append("prov=");
            buffer.append(province);
            buffer.append("&");
            buffer.append("Search=");
            out = new PrintWriter(connection.getOutputStream());
            out.print(buffer);
            out.close();
            parser = new Parser(connection);
        }
        catch (Exception e)
        {
            throw new ParserException(
                "You must be offline! This test needs you to be connected to the internet.",
                e);
        }


        pass = false;
        for (enumeration = parser.elements(); enumeration.hasMoreNodes();)
        {
            node = enumeration.nextNode();
            if (node instanceof StringNode)
            {
                string = (StringNode) node;

View Full Code Here

    public void testFile()
    {
        String path;
        File file;
        PrintWriter out;
        Parser parser;
        Node nodes[];
        int i;
        NodeIterator enumeration;


        path = System.getProperty("user.dir");
        if (!path.endsWith(File.separator))
            path += File.separator;
        file = new File(path + "delete_me.html");
        try
        {
            out = new PrintWriter(new FileWriter(file));
            out.println(
                "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">");
            out.println("<html>");
            out.println("<head>");
            out.println("<title>test</title>");
            out.println(
                "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">");
            out.println("</head>");
            out.println("<body>");
            out.println("This is a test page ");
            out.println("</body>");
            out.println("</html>");
            out.close();
            parser = new Parser(file.getAbsolutePath());
            nodes = new Node[30];
            i = 0;
            for (enumeration = parser.elements(); enumeration.hasMoreNodes();)
            {
                nodes[i] = enumeration.nextNode();
                i++;
            }
            assertEquals("Expected nodes", 12, i);

View Full Code Here

     * Here, ibm.co.jp is an example of a HTTP server that correctly sets the
     * charset in the header to match the content encoding.
     */
    public void testHTTPCharset()
    {
        Parser parser;
        try
        {
            parser = new Parser("http://www.ibm.com/jp/", Parser.noFeedback);
            assertTrue(
                "Character set should be Shift_JIS",
                parser.getEncoding().equalsIgnoreCase("Shift_JIS"));
        }
        catch (ParserException e)
        {
            fail("could not open http://www.ibm.com/jp/");
        }

View Full Code Here

     * charset in the header to match the content encoding. We check that after
     * the enumeration is created, that the charset has changed to the correct value.
     */
    public void testHTMLCharset()
    {
        Parser parser;
        NodeIterator enumeration;


        try
        {
            parser = new Parser("http://www.sony.co.jp", Parser.noFeedback);
            assertEquals(
                "Character set by default is ISO-8859-1",
                "ISO-8859-1",
                parser.getEncoding());
            enumeration = parser.elements();
            assertTrue(
                "Character set should be Shift_JIS",
                parser.getEncoding().equalsIgnoreCase("Shift_JIS"));
        }
        catch (ParserException e)
        {
            fail("could not open http://www.sony.co.jp");
        }

View Full Code Here

     * See bug #707447 META TAG - CHARSET
     * and bug #699886 can't parse website other than iso-8859-1
     */
    public void testSwitchCharset() throws ParserException
    {
        Parser parser;
        String url =
            "http://htmlparser.sourceforge.net/test/gb2312Charset.html";
        int i;
        Node[] nodes;


        parser = new Parser(url);
        i = 0;
        nodes = new Node[30];
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
            nodes[i++] = e.nextNode();
        assertEquals("Expected nodes", 14, i);
    }

View Full Code Here

    public LinkExtractor(String location)
    {
        this.location = location;
        try
        {
            this.parser = new Parser(location); // Create the parser object
            parser.registerScanners();
            // Register standard scanners (Very Important)
        }
        catch (ParserException e)
        {

View Full Code Here

     * header by a server-side web application.
     * Nonetheless, it would be nice to handle this case.
     */
    public void testDoubleQuotedCharset() throws ParserException
    {
        Parser parser;
        String url =
            "http://htmlparser.sourceforge.net/test/DoublequotedCharset.html";


        parser = new Parser(url);
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
            e.nextNode();
        assertTrue("Wrong encoding", parser.getEncoding().equals("UTF-8"));
    }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.htmlparser.Parser

cn.edu.pku.dr.requirement.elicitation.tools.HtmlTransformer

com.gnizr.core.util.FormatUtil

com.knowgate.hipermail.DBMimeMessage

com.knowgate.hipermail.HtmlMimeBodyPart

com.lanyuan.util.HttpClientUtils

com.vgo.movie.thread.DetailFilmThread

com.waxayaz.TomcatMI.core.utils.repoManager.TomcatRepositoryManager

com.wordpress.util.StringUtil

fitnesse.fixtures.PageDriver

fitnesse.slim.converters.MapEditor

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.