Package org.apache.tika.parser.microsoft.ooxml

Source Code of org.apache.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;

import java.io.IOException;
import java.util.Iterator;

import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
import org.xml.sax.SAXException;

public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {

    public XWPFWordExtractorDecorator(XWPFWordExtractor extractor) {
        super(extractor);
    }

    /**
     * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
     */
    @Override
    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
            XmlException, IOException {
        XWPFDocument document = (XWPFDocument) extractor.getDocument();
        XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();

        // headers
        if (hfPolicy.getFirstPageHeader() != null) {
            xhtml.element("p", hfPolicy.getFirstPageHeader().getText());
        }
        if (hfPolicy.getEvenPageHeader() != null) {
            xhtml.element("p", hfPolicy.getEvenPageHeader().getText());
        }
        if (hfPolicy.getDefaultHeader() != null) {
            xhtml.element("p", hfPolicy.getDefaultHeader().getText());
        }

        // first all paragraphs
        Iterator<XWPFParagraph> i = document.getParagraphsIterator();
        while (i.hasNext()) {
            XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
                    new XWPFHyperlinkDecorator(i.next(), null, true));
            xhtml.element("p", decorator.getText());
        }

        // then all document tables
        extractTableContent(document.getDocument().getBody().getTblArray(),
                xhtml);

        // footers
        if (hfPolicy.getFirstPageFooter() != null) {
            xhtml.element("p", hfPolicy.getFirstPageFooter().getText());
        }
        if (hfPolicy.getEvenPageFooter() != null) {
            xhtml.element("p", hfPolicy.getEvenPageFooter().getText());
        }
        if (hfPolicy.getDefaultFooter() != null) {
            xhtml.element("p", hfPolicy.getDefaultFooter().getText());
        }
    }

    /**
     * Low level structured parsing of document tables.
     */
    private void extractTableContent(CTTbl[] tables, XHTMLContentHandler xhtml)
            throws SAXException {
        for (CTTbl table : tables) {
            xhtml.startElement("table");
            xhtml.startElement("tbody");
            CTRow[] rows = table.getTrArray();
            for (CTRow row : rows) {
                xhtml.startElement("tr");
                CTTc[] cells = row.getTcArray();
                for (CTTc tc : cells) {
                    xhtml.startElement("td");
                    CTP[] content = tc.getPArray();
                    for (CTP ctp : content) {
                        CTR[] inner = ctp.getRArray();
                        for (CTR ctr : inner) {
                            CTText[] text = ctr.getTArray();
                            for (CTText textContent : text) {
                                xhtml.characters(textContent.getStringValue());
                            }
                        }
                    }
                    xhtml.endElement("td");
                }
                xhtml.endElement("tr");
            }
            xhtml.endElement("tbody");
            xhtml.endElement("table");
        }
    }
}
TOP

Related Classes of org.apache.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.