Class org.apache.tika.parser.microsoft.OfficeParser

Examples of org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType


        // It's regular OLE2:

        // What kind of document is it?
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
        TikaInputStream embedded = null;

        try {
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode)dir);
                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                   
                    byte[] data = ole.getDataBuffer();
                    embedded = TikaInputStream.get(data);
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it
                } catch (Exception e) {
                    logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {
                try {
                   // Grab the contents and process
                   DocumentEntry contentsEntry;
                   try {
                     contentsEntry = (DocumentEntry)dir.getEntry("CONTENTS");
                   } catch (FileNotFoundException ioe) {
                     contentsEntry = (DocumentEntry)dir.getEntry("Contents");
                   }
                   DocumentInputStream inp = new DocumentInputStream(contentsEntry);
                   byte[] contents = new byte[contentsEntry.getSize()];
                   inp.readFully(contents);
                   embedded = TikaInputStream.get(contents);
                  
                   // Try to work out what it is
                   MediaType mediaType = getDetector().detect(embedded, new Metadata());
                   String extension = type.getExtension();
                   try {
                      MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                      extension = mimeType.getExtension();
                   } catch(MimeTypeException mte) {
                      // No details on this type are known
                   }
                  
                   // Record what we can do about it
                   metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
                   metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
                } catch(Exception e) {
                   throw new TikaException("Invalid embedded resource", e);
                }
            } else {
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
            }

            // Should we parse it?
            if (extractor.shouldParseEmbedded(metadata)) {
                if (embedded == null) {
View Full Code Here


            Metadata metadata = new Metadata();
            TikaInputStream stream = null;
            metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

            DirectoryNode root = fs.getRoot();
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
           
            if (root.hasEntry("CONTENTS")
                  && root.hasEntry("\u0001Ole")
                  && root.hasEntry("\u0001CompObj")
                  && root.hasEntry("\u0003ObjInfo")) {
View Full Code Here

        // It's regular OLE2:

        // What kind of document is it?
        Metadata metadata = new Metadata();
        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
        TikaInputStream embedded = null;

        try {
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode)dir);
                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                   
                    byte[] data = ole.getDataBuffer();
                    embedded = TikaInputStream.get(data);
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it
                }
            } else {
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
            }

            // Should we parse it?
            if (extractor.shouldParseEmbedded(metadata)) {
                if (embedded == null) {
View Full Code Here

        try {
            Metadata metadata = new Metadata();
            TikaInputStream stream = null;

            DirectoryNode root = fs.getRoot();
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
           
            if (root.hasEntry("CONTENTS")
                  && root.hasEntry("\u0001Ole")
                  && root.hasEntry("\u0001CompObj")
                  && root.hasEntry("\u0003ObjInfo")) {
View Full Code Here

           newFS.writeFilesystem(out);
           out.close();

           // What kind of document is it?
           Metadata metadata = new Metadata();
           POIFSDocumentType type = POIFSDocumentType.detectType(dir);

           TikaInputStream embedded;

           if (type==POIFSDocumentType.OLE10_NATIVE) {
               Entry entry = dir.getEntry(Ole10Native.OLE10_NATIVE);
               ByteArrayOutputStream bos = new ByteArrayOutputStream();
               IOUtils.copy(new DocumentInputStream((DocumentEntry) entry), bos);
               byte[] data = bos.toByteArray();

               try {
                    Ole10Native ole = new Ole10Native(data, 0);
                    byte[] dataBuffer = ole.getDataBuffer();

                    metadata.set("resourceName", dir.getName() + '/' + ole.getLabel());

                    embedded = TikaInputStream.get(dataBuffer);
               } catch (Ole10NativeException ex) {
                 embedded = TikaInputStream.get(data);
               }
           } else {
               metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
               metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());

               embedded = TikaInputStream.get(tmpFile);
           }

           try {
View Full Code Here

            Metadata metadata = new Metadata();
            TikaInputStream stream = null;
            metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

            DirectoryNode root = fs.getRoot();
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
           
            if (root.hasEntry("CONTENTS")
                  && root.hasEntry("\u0001Ole")
                  && root.hasEntry("\u0001CompObj")
                  && root.hasEntry("\u0003ObjInfo")) {
View Full Code Here

        // It's regular OLE2:

        // What kind of document is it?
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
        TikaInputStream embedded = null;

        try {
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode)dir);
                    if (ole.getLabel() != null) {
                        metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                    }
                    byte[] data = ole.getDataBuffer();
                    embedded = TikaInputStream.get(data);
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it
                } catch (Exception e) {
                    logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {
                try {
                   // Grab the contents and process
                   DocumentEntry contentsEntry;
                   try {
                     contentsEntry = (DocumentEntry)dir.getEntry("CONTENTS");
                   } catch (FileNotFoundException ioe) {
                     contentsEntry = (DocumentEntry)dir.getEntry("Contents");
                   }
                   DocumentInputStream inp = new DocumentInputStream(contentsEntry);
                   byte[] contents = new byte[contentsEntry.getSize()];
                   inp.readFully(contents);
                   embedded = TikaInputStream.get(contents);
                  
                   // Try to work out what it is
                   MediaType mediaType = getDetector().detect(embedded, new Metadata());
                   String extension = type.getExtension();
                   try {
                      MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                      extension = mimeType.getExtension();
                   } catch(MimeTypeException mte) {
                      // No details on this type are known
                   }
                  
                   // Record what we can do about it
                   metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
                   metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
                } catch(Exception e) {
                   throw new TikaException("Invalid embedded resource", e);
                }
            } else {
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
            }

            // Should we parse it?
            if (extractor.shouldParseEmbedded(metadata)) {
                if (embedded == null) {
View Full Code Here

                IOUtils.copy(stream, out);
                ret = out.toByteArray();
            } else {
                //try poifs
                POIFSDocumentType type = POIFSDocumentType.detectType(root);
                if (type == POIFSDocumentType.OLE10_NATIVE) {
                    try {
                        // Try to un-wrap the OLE10Native record:
                        Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                        ret = ole.getDataBuffer();
                    } catch (Ole10NativeException ex) {
                        // Not a valid OLE10Native record, skip it
                    }
                } else if (type == POIFSDocumentType.COMP_OBJ) {

                    DocumentEntry contentsEntry;
                    try {
                        contentsEntry = (DocumentEntry)root.getEntry("CONTENTS");
                    } catch (FileNotFoundException ioe) {
                        contentsEntry = (DocumentEntry)root.getEntry("Contents");
                    }

                    DocumentInputStream inp = null;
                    try {
                        inp = new DocumentInputStream(contentsEntry);
                        ret = new byte[contentsEntry.getSize()];
                        inp.readFully(ret);
                    } finally {
                        if (inp != null) {
                            inp.close();
                        }
                    }
                } else {

                    ByteArrayOutputStream out = new ByteArrayOutputStream();
                    is.reset();
                    IOUtils.copy(is, out);
                    ret = out.toByteArray();
                    metadata.set(Metadata.RESOURCE_NAME_KEY, "file_"+unknownFilenameCount.getAndIncrement() + "."+type.getExtension());
                    metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                }
            }
        } finally {
            if (fs != null) {
                fs.close();
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.