Package org.apache.tika.sax

Examples of org.apache.tika.sax.XHTMLContentHandler


            metadata.set(TikaCoreProperties.TITLE, title);
            metadata.set(TikaCoreProperties.DESCRIPTION, description);
            // store the other fields in the metadata

            XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();

            xhtml.element("h1", title);
            xhtml.element("p", description);

            xhtml.startElement("ul");
            for (Object e : feed.getEntries()) {
                SyndEntry entry = (SyndEntry) e;
                String link = entry.getLink();
                if (link != null) {
                    xhtml.startElement("li");
                    xhtml.startElement("a", "href", link);
                    xhtml.characters(stripTags(entry.getTitleEx()));
                    xhtml.endElement("a");
                    SyndContent content = entry.getDescription();
                    if (content != null) {
                        xhtml.newline();
                        xhtml.characters(content.getValue());
                    }
                    xhtml.endElement("li");
                }
            }
            xhtml.endElement("ul");

            xhtml.endDocument();
        } catch (FeedException e) {
            throw new TikaException("RSS parse error", e);
        }

    }
View Full Code Here


            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
        metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3");

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // Create handlers for the various kinds of ID3 tags
        ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);

        if (audioAndTags.tags.length > 0) {
           CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);

           metadata.set(TikaCoreProperties.TITLE, tag.getTitle());
           metadata.set(TikaCoreProperties.CREATOR, tag.getArtist());
           metadata.set(XMPDM.ARTIST, tag.getArtist());
           metadata.set(XMPDM.COMPOSER, tag.getComposer());
           metadata.set(XMPDM.ALBUM, tag.getAlbum());
           metadata.set(XMPDM.RELEASE_DATE, tag.getYear());
           metadata.set(XMPDM.GENRE, tag.getGenre());
          
           List<String> comments = new ArrayList<String>();
           for (ID3Comment comment : tag.getComments()) {
              StringBuffer cmt = new StringBuffer();
              if (comment.getLanguage() != null) {
                 cmt.append(comment.getLanguage());
                 cmt.append(" - ");
              }
              if (comment.getDescription() != null) {
                 cmt.append(comment.getDescription());
                 if (comment.getText() != null) {
                    cmt.append("\n");
                 }
              }
              if (comment.getText() != null) {
                 cmt.append(comment.getText());
              }
             
              comments.add(cmt.toString());
              metadata.add(XMPDM.LOG_COMMENT.getName(), cmt.toString());
           }

           xhtml.element("h1", tag.getTitle());
           xhtml.element("p", tag.getArtist());

            // ID3v1.1 Track addition
            if (tag.getTrackNumber() != null) {
                xhtml.element("p", tag.getAlbum() + ", track " + tag.getTrackNumber());
                metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber());
            } else {
                xhtml.element("p", tag.getAlbum());
            }
            xhtml.element("p", tag.getYear());
            xhtml.element("p", tag.getGenre());
            for (String comment : comments) {
               xhtml.element("p", comment);
            }
        }
        if (audioAndTags.audio != null) {
            metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate()));
            metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
            metadata.set("version", audioAndTags.audio.getVersion());
           
            metadata.set(
                    XMPDM.AUDIO_SAMPLE_RATE,
                    Integer.toString(audioAndTags.audio.getSampleRate()));
            if(audioAndTags.audio.getChannels() == 1) {
               metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono");
            } else if(audioAndTags.audio.getChannels() == 2) {
               metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo");
            } else if(audioAndTags.audio.getChannels() == 5) {
               metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "5.1");
            } else if(audioAndTags.audio.getChannels() == 7) {
               metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "7.1");
            }
        }
        if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
           xhtml.startElement("p", "class", "lyrics");
           xhtml.characters(audioAndTags.lyrics.lyricsText);
           xhtml.endElement("p");
        }

        xhtml.endDocument();
    }
View Full Code Here

        // Use the delegate parser to parse the contained document
        EmbeddedDocumentExtractor extractor = context.get(
                EmbeddedDocumentExtractor.class,
                new ParsingEmbeddedDocumentExtractor(context));

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        try {
            ArchiveEntry entry = ais.getNextEntry();
            while (entry != null) {
                if (!entry.isDirectory()) {
                    parseEntry(ais, entry, extractor, xhtml);
                }
                entry = ais.getNextEntry();
            }
        } finally {
            ais.close();
        }

        xhtml.endDocument();
    }
View Full Code Here

            entryStream.mark(4096);
            IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
            entryStream.reset();
           
            if(type != null) {
               XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
               ContentHandler contentHandler;
              
               switch(type) {
               case KEYNOTE:
                  contentHandler = new KeynoteContentHandler(xhtml, metadata);
                  break;
               case NUMBERS:
                  contentHandler = new NumbersContentHandler(xhtml, metadata);
                  break;
               case PAGES:
                  contentHandler = new PagesContentHandler(xhtml, metadata);
                  break;
               case ENCRYPTED:
                   // We can't do anything for the file right now
                   contentHandler = null;
                   break;
               default:
                  throw new TikaException("Unhandled iWorks file " + type);
               }

               metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
               xhtml.startDocument();
               if (contentHandler != null) {
                  context.getSAXParser().parse(
                          new CloseShieldInputStream(entryStream),
                          new OfflineContentHandler(contentHandler)
                  );
               }
               xhtml.endDocument();
            }
           
            entry = zip.getNextZipEntry();
        }
        zip.close();
View Full Code Here

        MediaType type = getMediaType(cis);
        if (!type.equals(MediaType.OCTET_STREAM)) {
            metadata.set(CONTENT_TYPE, type.toString());
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        try {
            Metadata entrydata = new Metadata();
            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
            if (name != null) {
                if (name.endsWith(".tbz")) {
                    name = name.substring(0, name.length() - 4) + ".tar";
                } else if (name.endsWith(".tbz2")) {
                    name = name.substring(0, name.length() - 5) + ".tar";
                } else if (name.endsWith(".bz")) {
                    name = name.substring(0, name.length() - 3);
                } else if (name.endsWith(".bz2")) {
                    name = name.substring(0, name.length() - 4);
                } else if (name.endsWith(".xz")) {
                    name = name.substring(0, name.length() - 3);
                } else if (name.endsWith(".pack")) {
                    name = name.substring(0, name.length() - 5);
                } else if (name.length() > 0) {
                    name = GzipUtils.getUncompressedFilename(name);
                }
                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
            }

            // Use the delegate parser to parse the compressed document
            EmbeddedDocumentExtractor extractor = context.get(
                    EmbeddedDocumentExtractor.class,
                    new ParsingEmbeddedDocumentExtractor(context));
            if (extractor.shouldParseEmbedded(entrydata)) {
                extractor.parseEmbedded(cis, xhtml, entrydata, true);
            }
        } finally {
            cis.close();
        }

        xhtml.endDocument();
    }
View Full Code Here

           throws IOException, SAXException, TikaException {

        HashMap<String,String> properties = this.loadProperties(stream);
        this.setMetadata(metadata, properties);

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        // TODO: put body content here
        xhtml.startElement("p");
        String body = clean(properties.get("body"));
        if (body != null)
           xhtml.characters(body);
        xhtml.endElement("p");
        xhtml.endDocument();
    }
View Full Code Here

           // Bail out
           return;
        }

       
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
       
       
        // Pull out some information from the header box
        MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
        if (mHeader != null) {
           // Get the creation and modification dates
           metadata.set(
                 Metadata.CREATION_DATE,
                 MP4TimeToDate(mHeader.getCreationTime())
           );
           metadata.set(
                 TikaCoreProperties.MODIFIED,
                 MP4TimeToDate(mHeader.getModificationTime())
           );
          
           // Get the duration
           double durationSeconds = ((double)mHeader.getDuration()) / mHeader.getTimescale();
           // TODO Use this
          
           // The timescale is normally the sampling rate
           metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)mHeader.getTimescale());
        }
       
       
        // Get some more information from the track header
        // TODO Decide how to handle multiple tracks
        List<TrackBox> tb = moov.getBoxes(TrackBox.class);
        if (tb.size() > 0) {
           TrackBox track = tb.get(0);
          
           TrackHeaderBox header = track.getTrackHeaderBox();
           // Get the creation and modification dates
           metadata.set(
                 TikaCoreProperties.CREATED,
                 MP4TimeToDate(header.getCreationTime())
           );
           metadata.set(
                 TikaCoreProperties.MODIFIED,
                 MP4TimeToDate(header.getModificationTime())
           );
          
           // Get the video width and height
           metadata.set(Metadata.IMAGE_WIDTH,  (int)header.getWidth());
           metadata.set(Metadata.IMAGE_LENGTH, (int)header.getHeight());
          
           // Get the sample information
           SampleTableBox samples = track.getSampleTableBox();
           SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox();
           if (sampleDesc != null) {
              // Look for the first Audio Sample, if present
              AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
              if (sample != null) {
                 XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
                 //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize());    // TODO Num -> Type mapping
                 metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)sample.getSampleRate());
                 //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
                 //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
              }
           }
        }
       
        // Get metadata from the User Data Box
        UserDataBox userData = getOrNull(moov, UserDataBox.class);
        if (userData != null) {
           MetaBox meta = getOrNull(userData, MetaBox.class);

           // Check for iTunes Metadata
           // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
           //  http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
           AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
           if (apple != null) {
              // Title
              AppleTrackTitleBox title = getOrNull(apple, AppleTrackTitleBox.class);
              addMetadata(TikaCoreProperties.TITLE, metadata, title);

              // Artist
              AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
              addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
              addMetadata(XMPDM.ARTIST, metadata, artist);
             
              // Album
              AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
              addMetadata(XMPDM.ALBUM, metadata, album);
             
              // Composer
              AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
              addMetadata(XMPDM.COMPOSER, metadata, composer);
             
              // Genre
              AppleStandardGenreBox sGenre = getOrNull(apple, AppleStandardGenreBox.class);
              AppleCustomGenreBox   cGenre = getOrNull(apple, AppleCustomGenreBox.class);
              addMetadata(XMPDM.GENRE, metadata, sGenre);
              addMetadata(XMPDM.GENRE, metadata, cGenre);
             
              // Year
              AppleRecordingYearBox year = getOrNull(apple, AppleRecordingYearBox.class);
              addMetadata(XMPDM.RELEASE_DATE, metadata, year);
             
              // Track number
              AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
              if (trackNum != null) {
                 metadata.set(XMPDM.TRACK_NUMBER, trackNum.getTrackNumber());
                 //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getNumberOfTracks()); // TODO
              }
             
              // Comment
              AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
              addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
             
              // Encoder
              AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
              // addMetadata(XMPDM.???, metadata, encoder); // TODO
             
             
              // As text
              for (Box box : apple.getBoxes()) {
                 if (box instanceof AbstractAppleMetaDataBox) {
                    xhtml.element("p", ((AbstractAppleMetaDataBox)box).getValue());
                 }
              }
           }
          
           // TODO Check for other kinds too
        }

        // All done
        xhtml.endDocument();
    }
View Full Code Here

            }
        } catch (IOException e) {
            throw new TikaException("NetCDF parse error", e);
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }
View Full Code Here

            new JempboxExtractor(metadata).parse(tis);
        } finally {
            tmp.dispose();
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }
View Full Code Here

        }
    }

    /**
     * Creates a handler by wrapping the given content handler in a new
     * {@link XHTMLContentHandler} (constructed from {@code handler} and
     * {@code metadata}) and delegating to another constructor of this class.
     *
     * @param mapper   passed through unchanged to the delegated constructor
     * @param handler  content handler that is wrapped in the new
     *                 {@code XHTMLContentHandler}
     * @param metadata passed both to the {@code XHTMLContentHandler} wrapper
     *                 and to the delegated constructor
     */
    public HtmlHandler(
            HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
        // Same metadata instance is shared by the wrapper and this handler.
        this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.XHTMLContentHandler

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.