Package org.apache.tika.parser

Examples of org.apache.tika.parser.ParseContext


        xhtml.endDocument();
    }

    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata) throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }
View Full Code Here


        StringWriter writer = new StringWriter();
        parser.parse(
                new ByteArrayInputStream(text.getBytes("UTF-8")),
                new WriteOutContentHandler(writer),
                metadata,
                new ParseContext());
        String content = writer.toString();

        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
        assertEquals("en", metadata.get(Metadata.LANGUAGE));
View Full Code Here

        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(text.getBytes("UTF-8")),
                handler, metadata, new ParseContext());
        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        assertTrue(handler.toString().contains(text));
    }
View Full Code Here

    public void testEmptyText() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("\n", handler.toString());
    }
View Full Code Here

        final String test2 = "the name is \u00e1ndre";

        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(test2.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
       
        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
        parser.parse(
                new ByteArrayInputStream(test2.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
       
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

        final String test2 = "the name is \u00e1ndre";

        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(test2.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());

        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
        parser.parse(
                new ByteArrayInputStream(test2.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());

        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

     * @deprecated This method will be removed in Apache Tika 1.0.
     */
    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }
View Full Code Here

            public void ignorableWhitespace(char[] ch, int off, int len) {
                // Ignore the whitespace added by XHTMLContentHandler
            }
        };
        Metadata metadata = new Metadata();
        parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals(msg, expected, handler.toString());
    }
View Full Code Here

        Metadata metadata = new Metadata();
        metadata.set(Metadata.LANGUAGE, "en");

        parser.parse(
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());

        assertEquals("en", metadata.get(Metadata.LANGUAGE));
    }
View Full Code Here

     * @deprecated This method will be removed in Apache Tika 1.0.
     */
    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.ParseContext

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.