Examples of Tika


Examples of org.apache.tika.Tika

        out.println("    ports you specify as one or more arguments.");
        out.println();
    }

    private void version() {
        System.out.println(new Tika().toString());
    }
View Full Code Here

Examples of org.apache.tika.Tika

     * @throws MimeTypeException if the type can't be detected
     * @throws IOException if the file can't be read
     */
    public MimeType getMimeType(File file)
            throws MimeTypeException, IOException {
        return forName(new Tika(this).detect(file));
    }
View Full Code Here

Examples of org.apache.tika.Tika

                bundle(new File(base, "tika-bundle.jar").toURI().toURL().toString()));
    }
    //@Test
    public void testTikaBundle(BundleContext bc) throws Exception {
        Tika tika = new Tika();

        // Simple type detection
        assertEquals("text/plain", tika.detect("test.txt"));
        assertEquals("application/pdf", tika.detect("test.pdf"));

        // Simple text extraction
        String xml = tika.parseToString(new File("pom.xml"));
        assertTrue(xml.contains("tika-bundle"));

        // Package extraction
        ContentHandler handler = new BodyContentHandler();

        Parser parser = tika.getParser();
        ParseContext context = new ParseContext();
        context.set(Parser.class, parser);

        InputStream stream =
                new FileInputStream("src/test/resources/test-documents.zip");
View Full Code Here

Examples of org.apache.tika.Tika

        File file = getResourceAsFile("/test-documents/testRTFJapanese.rtf");
        Metadata metadata = new Metadata();
        InputStream stream = TikaInputStream.get(file, metadata);

        // Test w/ default limit:
        Tika localTika = new Tika();
        String content = localTika.parseToString(stream, metadata);
        // parseToString closes for convenience:
        //stream.close();
        assertTrue(content.length() > 500);

        // Test setting max length on the instance:
        localTika.setMaxStringLength(200);
        stream = TikaInputStream.get(file, metadata);
        content = localTika.parseToString(stream, metadata);
       
        // parseToString closes for convenience:
        //stream.close();
        assertTrue(content.length() <= 200);
       
        // Test setting max length per-call:
        stream = TikaInputStream.get(file, metadata);
        content = localTika.parseToString(stream, metadata, 100);
        // parseToString closes for convenience:
        //stream.close();
        assertTrue(content.length() <= 100);
    }
View Full Code Here

Examples of org.apache.tika.Tika

        .create(endPoint + VERSION_PATH)
        .type("text/plain")
        .accept("text/plain")
        .get();

    assertEquals(new Tika().toString(),
        getStringFromInputStream((InputStream) response.getEntity()));
  }
View Full Code Here

Examples of org.apache.tika.Tika

    @Test
    @Ignore("The file 'testXHTML_utf8.html' is not available fo testing")
    public void XtestParseUTF8() throws IOException, SAXException, TikaException {
        String path = "/test-documents/testXHTML_utf8.html";
        Metadata metadata = new Metadata();
        String content = new Tika().parseToString(
                HtmlParserTest.class.getResourceAsStream(path), metadata);

        assertTrue("Did not contain expected text:"
                + "Title : Tilte with UTF-8 chars √∂√§√•", content
                .contains("Title : Tilte with UTF-8 chars √∂√§√•"));
View Full Code Here

Examples of org.apache.tika.Tika

    @Test
    public void testXhtmlParsing() throws Exception {
        String path = "/test-documents/testXHTML.html";
        Metadata metadata = new Metadata();
        String content = new Tika().parseToString(
                HtmlParserTest.class.getResourceAsStream(path), metadata);

        assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE));
View Full Code Here

Examples of org.apache.tika.Tika

     * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
     */
    @Test
    public void testCharactersDirectlyUnderBodyElement() throws Exception {
        String test = "<html><body>test</body></html>";
        String content = new Tika().parseToString(
                new ByteArrayInputStream(test.getBytes("UTF-8")));
        assertEquals("test", content);
    }
View Full Code Here

Examples of org.apache.tika.Tika

     */
    @Test
    public void testWhitespaceBetweenTableCells() throws Exception {
        String test =
            "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
        String content = new Tika().parseToString(
                new ByteArrayInputStream(test.getBytes("UTF-8")));
        assertTrue(content.contains("a"));
        assertTrue(content.contains("b"));
        assertFalse(content.contains("ab"));
    }
View Full Code Here

Examples of org.apache.tika.Tika

     * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
     */
    @Test
    public void testLineBreak() throws Exception {
        String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
        String text = new Tika().parseToString(
                new ByteArrayInputStream(test.getBytes("US-ASCII")));
        String[] parts = text.trim().split("\\s+");
        assertEquals(3, parts.length);
        assertEquals("foo", parts[0]);
        assertEquals("bar", parts[1]);
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.