Package org.apache.tika.parser

Examples of org.apache.tika.parser.AutoDetectParser


      ParseContext parseContext = parseContextProvider.getParseContext( name, value );

      StringWriter writer = new StringWriter();
      WriteOutContentHandler contentHandler = new WriteOutContentHandler( writer );

      Parser parser = new AutoDetectParser();
      parser.parse( in, contentHandler, metadata, parseContext );
      luceneOptions.addFieldToDocument( name, writer.toString(), document );

      // allow for optional indexing of metadata by the user
      metadataProcessor.set( name, value, document, luceneOptions, metadata );
    }
View Full Code Here


    public JackrabbitParser() {
        InputStream stream =
            JackrabbitParser.class.getResourceAsStream("tika-config.xml");
        try {
            try {
                parser = new AutoDetectParser(new TikaConfig(stream));
            } finally {
                stream.close();
            }
        } catch (Exception e) {
            // Should never happen
View Full Code Here

        }

        if (forkJavaCommand != null) {
            ForkParser forkParser = new ForkParser(
                    SearchIndex.class.getClassLoader(),
                    new AutoDetectParser(config));
            forkParser.setJavaCommand(forkJavaCommand);
            forkParser.setPoolSize(extractorPoolSize);
            return forkParser;
        } else {
            return new AutoDetectParser(config);
        }
    }
View Full Code Here

      templateAdd.allowDups = true;
      templateAdd.overwriteCommitted = false;
      templateAdd.overwritePending = false;
    }
    //this is lightweight
    autoDetectParser = new AutoDetectParser(config);
    this.factory = factory;
   
    ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
  }
View Full Code Here

    public void testExcel() throws Exception {
        InputStream input = OOXMLParserTest.class
                .getResourceAsStream("/test-documents/testEXCEL.xlsx");

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // TODO: should auto-detect without the resource name
        metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(input, handler, metadata);
           
            assertEquals(
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
View Full Code Here

    public void testPowerPoint() throws Exception {
        InputStream input = OOXMLParserTest.class
                .getResourceAsStream("/test-documents/testPPT.pptx");

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // TODO: should auto-detect without the resource name
        metadata.set(Metadata.RESOURCE_NAME_KEY, "testPPT.pptx");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(input, handler, metadata);
           
            assertEquals(
                    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
View Full Code Here

    public void testWord() throws Exception {
        InputStream input = OOXMLParserTest.class
                .getResourceAsStream("/test-documents/testWORD.docx");

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // TODO: should auto-detect without the resource name
        metadata.set(Metadata.RESOURCE_NAME_KEY, "testWORD.docx");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(input, handler, metadata);
           
            assertEquals(
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
View Full Code Here

public class TikaDocumentParser implements Parser {

  @Override
  public Parse parse(ContentEntity entity, Link link) throws DroidsException,
      IOException {
    org.apache.tika.parser.Parser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    BodyContentHandler handler = new BodyContentHandler();
   
    InputStream instream = entity.obtainContent();
    try {
      parser.parse(instream, handler, metadata, new ParseContext());
      ParseImpl parse = new ParseImpl(handler.toString(),null);
     
      return parse;

    } catch (SAXException ex) {
View Full Code Here

  }

  @Override
  public Parse parse(ContentEntity entity, Link link) throws IOException, DroidsException {
    // Init Tika objects
    org.apache.tika.parser.Parser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
   
    String charset = entity.getCharset();
    if (charset == null) {
      charset = "UTF-8";
    }
    EchoHandler data = new EchoHandler(charset);
    LinkExtractor extractor = new LinkExtractor(link, elements);
   
    TeeContentHandler parallelHandler = new TeeContentHandler(data, extractor);

    InputStream instream = entity.obtainContent();
    try {
      parser.parse(instream, parallelHandler, metadata);
     
      return new ParseImpl(data.toString(), extractor.getLinks());
    } catch (SAXException ex) {
      throw new DroidsException("Failure parsing document " + link.getId(), ex);
    } catch (TikaException ex) {
View Full Code Here

     * @return text from document without format
     */
    public static String getTextFromDocument(byte[] document) {
        String errMissingTika = JMeterUtils.getResString("view_results_response_missing_tika"); // $NON-NLS-1$
        String response = errMissingTika;
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler(MAX_DOCUMENT_SIZE > 0 ? MAX_DOCUMENT_SIZE : -1); // -1 to disable the write limit
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        InputStream stream = new ByteArrayInputStream(document); // open the stream
        try {
            parser.parse(stream, handler, metadata, context);
            response = handler.toString();
        } catch (Exception e) {
            response = e.toString();
            log.warn("Error document parsing:", e);
        } catch (NoClassDefFoundError e) {
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.AutoDetectParser

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.