Examples of TemporaryResources


Examples of org.apache.tika.io.TemporaryResources

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
      
        PDDocument pdfDocument = null;
        TemporaryResources tmp = new TemporaryResources();
        //config from context, or default if not set via context
        PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
        try {
            // PDFBox can process entirely in memory, or can use a temp file
            //  for unpacked / processed resources
            // Decide which to do based on if we're reading from a file or not already
            TikaInputStream tstream = TikaInputStream.cast(stream);
            if (tstream != null && tstream.hasFile()) {
                // File based, take that as a cue to use a temporary file
                RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
                if (localConfig.getUseNonSequentialParser() == true){
                    pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile);
                } else {
                    pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
                }
            } else {
                // Go for the normal, stream based in-memory parsing
                if (localConfig.getUseNonSequentialParser() == true){
                    pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), new RandomAccessBuffer());
                } else {
                    pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
                }
            }
           
          
            if (pdfDocument.isEncrypted()) {
                String password = null;
               
                // Did they supply a new style Password Provider?
                PasswordProvider passwordProvider = context.get(PasswordProvider.class);
                if (passwordProvider != null) {
                   password = passwordProvider.getPassword(metadata);
                }
               
                // Fall back on the old style metadata if set
                if (password == null && metadata.get(PASSWORD) != null) {
                   password = metadata.get(PASSWORD);
                }
               
                // If no password is given, use an empty string as the default
                if (password == null) {
                   password = "";
                }
              
                try {
                    pdfDocument.decrypt(password);
                } catch (Exception e) {
                    // Ignore
                }
            }
            metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
            extractMetadata(pdfDocument, metadata);
            PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
           
        } finally {
            if (pdfDocument != null) {
               pdfDocument.close();
            }
            tmp.dispose();
        }
        handler.endDocument();
    }
View Full Code Here

Examples of org.apache.tika.io.TemporaryResources

        // Check if we have access to the document
        if (input == null) {
            return MediaType.OCTET_STREAM;
        }

        TemporaryResources tmp = new TemporaryResources();
        try {
            TikaInputStream tis = TikaInputStream.get(input, tmp);

            byte[] prefix = new byte[1024]; // enough for all known formats
            int length = tis.peek(prefix);

            MediaType type = detectArchiveFormat(prefix, length);
            if (PackageParser.isZipArchive(type)
                    && TikaInputStream.isTikaInputStream(input)) {
                return detectZipFormat(tis);
            } else if (!type.equals(MediaType.OCTET_STREAM)) {
                return type;
            } else {
                return detectCompressorFormat(prefix, length);
            }
        } finally {
            try {
                tmp.dispose();
            } catch (TikaException e) {
                // ignore
            }
        }
    }
View Full Code Here

Examples of org.apache.tika.io.TemporaryResources

                entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
            }
            if (extractor.shouldParseEmbedded(entrydata)) {
                // For detectors to work, we need a mark/reset supporting
                // InputStream, which ArchiveInputStream isn't, so wrap
                TemporaryResources tmp = new TemporaryResources();
                try {
                    TikaInputStream tis = TikaInputStream.get(archive, tmp);
                    extractor.parseEmbedded(tis, xhtml, entrydata, true);
                } finally {
                    tmp.dispose();
                }
            }
        } else if (name != null && name.length() > 0) {
            xhtml.element("p", name);
        }
View Full Code Here

Examples of org.apache.tika.io.TemporaryResources

        // Check if we have access to the document
        if (input == null) {
            return MediaType.OCTET_STREAM;
        }

        TemporaryResources tmp = new TemporaryResources();
        try {
            TikaInputStream tis = TikaInputStream.get(input, tmp);

            byte[] prefix = new byte[1024]; // enough for all known formats
            int length = tis.peek(prefix);

            MediaType type = detectArchiveFormat(prefix, length);
            if (PackageParser.isZipArchive(type)
                    && TikaInputStream.isTikaInputStream(input)) {
                return detectZipFormat(tis);
            } else if (!type.equals(MediaType.OCTET_STREAM)) {
                return type;
            } else {
                return detectCompressorFormat(prefix, length);
            }
        } finally {
            try {
                tmp.dispose();
            } catch (TikaException e) {
                // ignore
            }
        }
    }
View Full Code Here

Examples of org.apache.tika.io.TemporaryResources

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
      
        PDDocument pdfDocument = null;
        TemporaryResources tmp = new TemporaryResources();

        try {
            // PDFBox can process entirely in memory, or can use a temp file
            //  for unpacked / processed resources
            // Decide which to do based on if we're reading from a file or not already
            TikaInputStream tstream = TikaInputStream.cast(stream);
            if (tstream != null && tstream.hasFile()) {
               // File based, take that as a cue to use a temporary file
               RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
               pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
            } else {
               // Go for the normal, stream based in-memory parsing
               pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
            }
          
            if (pdfDocument.isEncrypted()) {
                String password = null;
               
                // Did they supply a new style Password Provider?
                PasswordProvider passwordProvider = context.get(PasswordProvider.class);
                if (passwordProvider != null) {
                   password = passwordProvider.getPassword(metadata);
                }
               
                // Fall back on the old style metadata if set
                if (password == null && metadata.get(PASSWORD) != null) {
                   password = metadata.get(PASSWORD);
                }
               
                // If no password is given, use an empty string as the default
                if (password == null) {
                   password = "";
                }
              
                try {
                    pdfDocument.decrypt(password);
                } catch (Exception e) {
                    // Ignore
                }
            }
            metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
            extractMetadata(pdfDocument, metadata);
            PDF2XHTML.process(pdfDocument, handler, metadata,
                              extractAnnotationText, enableAutoSpace,
                              suppressDuplicateOverlappingText, sortByPosition);

            extractEmbeddedDocuments(context, pdfDocument, handler);
        } finally {
            if (pdfDocument != null) {
               pdfDocument.close();
            }
            tmp.dispose();
        }
    }
View Full Code Here

Examples of org.apache.tika.io.TemporaryResources

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        TemporaryResources tmp = new TemporaryResources();
        try {
            TikaInputStream tis = TikaInputStream.get(stream, tmp);
            new ImageMetadataExtractor(metadata).parseTiff(tis.getFile());
            new JempboxExtractor(metadata).parse(tis);
        } finally {
            tmp.dispose();
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
View Full Code Here

Examples of org.apache.tika.io.TemporaryResources

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        TemporaryResources tmp = new TemporaryResources();
        try {
            TikaInputStream tis = TikaInputStream.get(stream, tmp);

            // Automatically detect the MIME type of the document
            MediaType type = detector.detect(tis, metadata);
            metadata.set(Metadata.CONTENT_TYPE, type.toString());

            // TIKA-216: Zip bomb prevention
            SecureContentHandler sch = new SecureContentHandler(handler, tis);
            try {
                // Parse the document
                super.parse(tis, sch, metadata, context);
            } catch (SAXException e) {
                // Convert zip bomb exceptions to TikaExceptions
                sch.throwIfCauseOf(e);
                throw e;
            }
        } finally {
            tmp.dispose();
        }
    }
View Full Code Here

Examples of org.apache.tika.io.TemporaryResources

                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
            }
            if (extractor.shouldParseEmbedded(entrydata)) {
                // For detectors to work, we need a mark/reset supporting
                // InputStream, which ArchiveInputStream isn't, so wrap
                TemporaryResources tmp = new TemporaryResources();
                try {
                    TikaInputStream tis = TikaInputStream.get(archive, tmp);
                    extractor.parseEmbedded(tis, xhtml, entrydata, true);
                } finally {
                    tmp.dispose();
                }
            }
        } else if (name != null && name.length() > 0) {
            xhtml.element("p", name);
        }
View Full Code Here

Examples of org.apache.tika.io.TemporaryResources

final class EmbeddedExtractor {

  public boolean parseEmbedded(InputStream stream, Record record, String name, Command child) {
    // Use the delegate parser to parse this entry
   
    TemporaryResources tmp = new TemporaryResources();
    try {
      final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
      if (stream instanceof TikaInputStream) {
        final Object container = ((TikaInputStream) stream).getOpenContainer();
        if (container != null) {
View Full Code Here

Examples of org.apache.tika.io.TemporaryResources

        Record entrydata = new Record(); // TODO: or pass myself?
        //Record entrydata = record.copy();
       
        // For detectors to work, we need a mark/reset supporting
        // InputStream, which ArchiveInputStream isn't, so wrap
        TemporaryResources tmp = new TemporaryResources();
        try {
          TikaInputStream tis = TikaInputStream.get(archive, tmp);
          return extractor.parseEmbedded(tis, entrydata, name, getChild());
        } finally {
          try {
            tmp.dispose();
          } catch (TikaException e) {
            LOG.warn("Cannot dispose of tmp Tika resources", e);
          }
        }
      } else {
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.