Examples of POIFSDocumentType


Examples of org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType

           newFS.writeFilesystem(out);
           out.close();

           // What kind of document is it?
           Metadata metadata = new Metadata();
           POIFSDocumentType type = POIFSDocumentType.detectType(dir);
           metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());

           // Trigger for the document itself
           TikaInputStream embedded = TikaInputStream.get(tmpFile);
           try {
               if (extractor.shouldParseEmbedded(metadata)) {
View Full Code Here

Examples of org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType

         POIFSFileSystem fs =
             new POIFSFileSystem(new FileInputStream(stream.getFile()));
         stream.setOpenContainer(fs);

         // See if it's one of the Microsoft Office file formats?
         POIFSDocumentType type = POIFSDocumentType.detectType(fs);
         if(type != POIFSDocumentType.UNKNOWN) {
            return type.getType();
         }
        
         // Is it one of the Corel formats which use OLE2?
         MediaType mt = detectCorel(fs.getRoot());
         if(mt != null) return mt;
View Full Code Here

Examples of org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType

            Metadata metadata = new Metadata();
            TikaInputStream stream = null;
            metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

            DirectoryNode root = fs.getRoot();
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
           
            if (root.hasEntry("CONTENTS")
                  && root.hasEntry("\u0001Ole")
                  && root.hasEntry("\u0001CompObj")
                  && root.hasEntry("\u0003ObjInfo")) {
View Full Code Here

Examples of org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType

        // It's regular OLE2:

        // What kind of document is it?
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
        TikaInputStream embedded = null;

        try {
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode)dir);
                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                   
                    byte[] data = ole.getDataBuffer();
                    embedded = TikaInputStream.get(data);
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {
                try {
                   // Grab the contents and process
                   DocumentEntry contentsEntry;
                   try {
                     contentsEntry = (DocumentEntry)dir.getEntry("CONTENTS");
                   } catch (FileNotFoundException ioe) {
                     contentsEntry = (DocumentEntry)dir.getEntry("Contents");
                   }
                   DocumentInputStream inp = new DocumentInputStream(contentsEntry);
                   byte[] contents = new byte[contentsEntry.getSize()];
                   inp.readFully(contents);
                   embedded = TikaInputStream.get(contents);
                  
                   // Try to work out what it is
                   MediaType mediaType = getDetector().detect(embedded, new Metadata());
                   String extension = type.getExtension();
                   try {
                      MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                      extension = mimeType.getExtension();
                   } catch(MimeTypeException mte) {
                      // No details on this type are known
                   }
                  
                   // Record what we can do about it
                   metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
                   metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
                } catch(Exception e) {
                   throw new TikaException("Invalid embedded resource", e);
                }
            } else {
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
            }

            // Should we parse it?
            if (extractor.shouldParseEmbedded(metadata)) {
                if (embedded == null) {
View Full Code Here

Examples of org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType

           newFS.writeFilesystem(out);
           out.close();

           // What kind of document is it?
           Metadata metadata = new Metadata();
           POIFSDocumentType type = POIFSDocumentType.detectType(dir);

           TikaInputStream embedded;

           if (type==POIFSDocumentType.OLE10_NATIVE) {
               Entry entry = dir.getEntry(Ole10Native.OLE10_NATIVE);
               ByteArrayOutputStream bos = new ByteArrayOutputStream();
               IOUtils.copy(new DocumentInputStream((DocumentEntry) entry), bos);
               byte[] data = bos.toByteArray();

               try {
                    Ole10Native ole = new Ole10Native(data, 0);
                    byte[] dataBuffer = ole.getDataBuffer();

                    metadata.set("resourceName", dir.getName() + '/' + ole.getLabel());

                    embedded = TikaInputStream.get(dataBuffer);
               } catch (Ole10NativeException ex) {
                 embedded = TikaInputStream.get(data);
               }
           } else {
               metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
               metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());

               embedded = TikaInputStream.get(tmpFile);
           }

           try {
View Full Code Here

Examples of org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType

        // It's regular OLE2:

        // What kind of document is it?
        Metadata metadata = new Metadata();
        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
        TikaInputStream embedded = null;

        try {
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode)dir);
                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                   
                    byte[] data = ole.getDataBuffer();
                    embedded = TikaInputStream.get(data);
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {
                try {
                   // Grab the contents and process
                   DocumentEntry contentsEntry = (DocumentEntry)dir.getEntry("CONTENTS");
                   DocumentInputStream inp = new DocumentInputStream(contentsEntry);
                   byte[] contents = new byte[contentsEntry.getSize()];
                   inp.readFully(contents);
                   embedded = TikaInputStream.get(contents);
                  
                   // Try to work out what it is
                   MediaType mediaType = getDetector().detect(embedded, new Metadata());
                   String extension = type.getExtension();
                   try {
                      MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                      extension = mimeType.getExtension();
                   } catch(MimeTypeException mte) {
                      // No details on this type are known
                   }
                  
                   // Record what we can do about it
                   metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
                   metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
                } catch(Exception e) {
                   throw new TikaException("Invalid embedded resource", e);
                }
            } else {
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
            }

            // Should we parse it?
            if (extractor.shouldParseEmbedded(metadata)) {
                if (embedded == null) {
View Full Code Here

Examples of org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType

        try {
            Metadata metadata = new Metadata();
            TikaInputStream stream = null;

            DirectoryNode root = fs.getRoot();
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
           
            if (root.hasEntry("CONTENTS")
                  && root.hasEntry("\u0001Ole")
                  && root.hasEntry("\u0001CompObj")
                  && root.hasEntry("\u0003ObjInfo")) {
View Full Code Here

Examples of org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType

        try {
            Metadata metadata = new Metadata();
            TikaInputStream stream = null;

            DirectoryNode root = fs.getRoot();
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
           
            if (root.hasEntry("CONTENTS")
                  && root.hasEntry("\u0001Ole")
                  && root.hasEntry("\u0001CompObj")
                  && root.hasEntry("\u0003ObjInfo")) {
View Full Code Here

Examples of org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType

            Metadata metadata = new Metadata();
            TikaInputStream stream = null;
            metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

            DirectoryNode root = fs.getRoot();
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
           
            if (root.hasEntry("CONTENTS")
                  && root.hasEntry("\u0001Ole")
                  && root.hasEntry("\u0001CompObj")
                  && root.hasEntry("\u0003ObjInfo")) {
View Full Code Here

Examples of org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType

        // It's regular OLE2:

        // What kind of document is it?
        Metadata metadata = new Metadata();
        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
        TikaInputStream embedded = null;

        try {
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                Entry entry = dir.getEntry(Ole10Native.OLE10_NATIVE);
                ByteArrayOutputStream bos = new ByteArrayOutputStream();

                // TODO: once we upgrade to POI 3.8 beta 5
                // we can avoid this full copy/serialize by
                // passing the DirectoryNode instead:
                IOUtils.copy(new DocumentInputStream((DocumentEntry) entry), bos);
                byte[] data = bos.toByteArray();

                try {
                    // Maybe unwrap OLE10Native record:
                    Ole10Native ole = new Ole10Native(data, 0);
                    data = ole.getDataBuffer();
                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                } catch (Ole10NativeException ex) {
                    // Not an OLE10Native record
                }
                embedded = TikaInputStream.get(data);
            } else {
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
            }

            // Should we parse it?
            if (extractor.shouldParseEmbedded(metadata)) {
                if (embedded == null) {
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle. Contact coftware#gmail.com.