Examples of ParseContext


Examples of org.apache.tika.parser.ParseContext

                    }
                }
            };
            new HtmlParser().parse(
                    stream, new TeeContentHandler(body, link),
                    metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals(
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

    public void testParseEmpty() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        new HtmlParser().parse(
                new ByteArrayInputStream(new byte[0]),
                handler,  new Metadata(), new ParseContext());
        assertEquals("", handler.toString());
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

                            links.add(atts.getValue("", "href"));
                        }
                    }
                },
                new Metadata(),
                new ParseContext());
        assertEquals(1, links.size());
        assertEquals(url, links.get(0));
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

            + "<title>the name is \u00e1ndre</title>"
            + "</head><body></body></html>";
        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

        String test =
            "<html><head><title>\u017d</title></head><body></body></html>";
        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("\u017d", metadata.get(Metadata.TITLE));
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

            + "<body></body></html>";

        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

        String test = "<html><title>Simple Content</title><body></body></html>";
        Metadata metadata = new Metadata();
        metadata.add(Metadata.CONTENT_LANGUAGE, "en");
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());

        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

            + "<title>the name is \u00e1ndre</title>"
            + "</head><body></body></html>";
        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test1.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));

        // Some HTML pages have errors like ';;' versus '; ' as separator
        String test2 =
            "<html><head><meta http-equiv=\"content-type\""
            + " content=\"text/html;;charset=ISO-8859-1\" />"
            + "<title>the name is \u00e1ndre</title>"
            + "</head><body></body></html>";
        metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test2.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

            + "<body></body></html>";

        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

    public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
        String path = "/test-documents/big-preamble.html";
        Metadata metadata = new Metadata();
        new HtmlParser().parse(
                HtmlParserTest.class.getResourceAsStream(path),
                new BodyContentHandler(),  metadata, new ParseContext());

        assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.