Examples of TikaInputStream


Examples of org.apache.tika.io.TikaInputStream

       xhtml.endElement("img");

       // Have we already output this one?
       // (Only expose each individual image once)
       if(! pictures.hasOutput(picture)) {
          TikaInputStream stream = TikaInputStream.get(picture.getContent());
          handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false);
          pictures.recordOutput(picture);
       }
    }
View Full Code Here

Examples of org.apache.tika.io.TikaInputStream

              if (escherRecord instanceof EscherBSERecord) {
                 EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord();
                 if (blip != null) {
                    HSSFPictureData picture = new HSSFPictureData(blip);
                    String mimeType = picture.getMimeType();
                    TikaInputStream stream = TikaInputStream.get(picture.getData());
                   
                    // Handle the embeded resource
                    extractor.handleEmbeddedResource(
                          stream, null, null, mimeType,
                          handler, true
View Full Code Here

Examples of org.apache.tika.io.TikaInputStream

            if (extractor.shouldParseEmbedded(entrydata)) {
                // For detectors to work, we need a mark/reset supporting
                // InputStream, which ArchiveInputStream isn't, so wrap
                TemporaryResources tmp = new TemporaryResources();
                try {
                    TikaInputStream tis = TikaInputStream.get(archive, tmp);
                    extractor.parseEmbedded(tis, xhtml, entrydata, true);
                } finally {
                    tmp.dispose();
                }
            }
View Full Code Here

Examples of org.apache.tika.io.TikaInputStream

            System.out.println("Extracting '"+name+"' ("+contentType+") to " + outputFile);

            FileOutputStream os = new FileOutputStream(outputFile);

            if (inputStream instanceof TikaInputStream) {
                TikaInputStream tin = (TikaInputStream) inputStream;

                if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
                    POIFSFileSystem fs = new POIFSFileSystem();
                    copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
                    fs.writeFilesystem(os);
                } else {
                    IOUtils.copy(inputStream, os);
                }
            } else {
View Full Code Here

Examples of org.apache.tika.io.TikaInputStream

        IsoFile isoFile;
       
        // The MP4Parser library accepts either a File, or a byte array
        // As MP4 video files are typically large, always use a file to
        //  avoid OOMs that may occur with in-memory buffering
        TikaInputStream tstream = TikaInputStream.get(stream);
        try {
           isoFile = new IsoFile(tstream.getFileChannel());
        } finally {
           tstream.close();
        }
       
       
        // Grab the file type box
        FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
View Full Code Here

Examples of org.apache.tika.io.TikaInputStream

            return MediaType.OCTET_STREAM;
        }

        TemporaryResources tmp = new TemporaryResources();
        try {
            TikaInputStream tis = TikaInputStream.get(input, tmp);

            byte[] prefix = new byte[1024]; // enough for all known formats
            int length = tis.peek(prefix);

            MediaType type = detectArchiveFormat(prefix, length);
            if (PackageParser.isZipArchive(type)
                    && TikaInputStream.isTikaInputStream(input)) {
                return detectZipFormat(tis);
View Full Code Here

Examples of org.apache.tika.io.TikaInputStream

               attributes.addAttribute("", "class", "class", "CDATA", "embedded");
               attributes.addAttribute("", "id", "id", "CDATA", objID);
               xhtml.startElement("div", attributes);
               xhtml.endElement("div");

               TikaInputStream stream =
                    TikaInputStream.get(data.getData());
               try {
                  String mediaType = null;
                  if ("Excel.Chart.8".equals(oleShape.getProgID())) {
                     mediaType = "application/vnd.ms-excel";
                  }
                  handleEmbeddedResource(
                        stream, objID, objID,
                        mediaType, xhtml, false);
               } finally {
                  stream.close();
               }
            }
         }
      }
   }
View Full Code Here

Examples of org.apache.tika.io.TikaInputStream

        try {
            // PDFBox can process entirely in memory, or can use a temp file
            //  for unpacked / processed resources
            // Decide which to do based on if we're reading from a file or not already
            TikaInputStream tstream = TikaInputStream.cast(stream);
            if (tstream != null && tstream.hasFile()) {
               // File based, take that as a cue to use a temporary file
               RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
               pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
            } else {
               // Go for the normal, stream based in-memory parsing
View Full Code Here

Examples of org.apache.tika.io.TikaInputStream

                        metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
                        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
                        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));

                        if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                            TikaInputStream stream = TikaInputStream.get(file.createInputStream());
                            try {
                                embeddedExtractor.parseEmbedded(
                                                                stream,
                                                                new EmbeddedContentHandler(handler),
                                                                metadata, false);
                            } finally {
                                stream.close();
                            }
                        }
                    }
                }
            }
View Full Code Here

Examples of org.apache.tika.io.TikaInputStream

        if (dir.hasEntry("Package")) {
            // It's OOXML (has a ZipFile):
            Entry ooxml = dir.getEntry("Package");

            TikaInputStream stream = TikaInputStream.get(
                    new DocumentInputStream((DocumentEntry) ooxml));
            try {
                ZipContainerDetector detector = new ZipContainerDetector();
                MediaType type = detector.detect(stream, new Metadata());
                handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true);
                return;
            } finally {
                stream.close();
            }
        }

        // It's regular OLE2:

        // What kind of document is it?
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
        TikaInputStream embedded = null;

        try {
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode)dir);
                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                   
                    byte[] data = ole.getDataBuffer();
                    embedded = TikaInputStream.get(data);
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {
                try {
                   // Grab the contents and process
                   DocumentEntry contentsEntry;
                   try {
                     contentsEntry = (DocumentEntry)dir.getEntry("CONTENTS");
                   } catch (FileNotFoundException ioe) {
                     contentsEntry = (DocumentEntry)dir.getEntry("Contents");
                   }
                   DocumentInputStream inp = new DocumentInputStream(contentsEntry);
                   byte[] contents = new byte[contentsEntry.getSize()];
                   inp.readFully(contents);
                   embedded = TikaInputStream.get(contents);
                  
                   // Try to work out what it is
                   MediaType mediaType = getDetector().detect(embedded, new Metadata());
                   String extension = type.getExtension();
                   try {
                      MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                      extension = mimeType.getExtension();
                   } catch(MimeTypeException mte) {
                      // No details on this type are known
                   }
                  
                   // Record what we can do about it
                   metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
                   metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
                } catch(Exception e) {
                   throw new TikaException("Invalid embedded resource", e);
                }
            } else {
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
            }

            // Should we parse it?
            if (extractor.shouldParseEmbedded(metadata)) {
                if (embedded == null) {
                    // Make a TikaInputStream that just
                    // passes the root directory of the
                    // embedded document, and is otherwise
                    // empty (byte[0]):
                    embedded = TikaInputStream.get(new byte[0]);
                    embedded.setOpenContainer(dir);
                }
                extractor.parseEmbedded(embedded, xhtml, metadata, true);
            }
        } finally {
            if (embedded != null) {
                embedded.close();
            }
        }
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.