Source Code of org.apache.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.ooxml;


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
import org.apache.poi.xwpf.usermodel.BodyType;
import org.apache.poi.xwpf.usermodel.IBody;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPicture;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFStyle;
import org.apache.poi.xwpf.usermodel.XWPFStyles;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.WordExtractor;
import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
import org.xml.sax.SAXException;


public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
    private XWPFDocument document;
    private XWPFStyles styles;


    public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) {
        super(context, extractor, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
        
        document = (XWPFDocument) extractor.getDocument();
        styles = document.getStyles();
    }


    /**
     * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
     */
    @Override
    protected void buildXHTML(XHTMLContentHandler xhtml)
            throws SAXException, XmlException, IOException {
        XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();


        // headers
        extractHeaders(xhtml, hfPolicy);


        // process text in the order that it occurs in
        extractIBodyText(document, xhtml);


        // then all document tables
        extractFooters(xhtml, hfPolicy);
    }


    private void extractIBodyText(IBody bodyElement, XHTMLContentHandler xhtml)
            throws SAXException, XmlException, IOException {
       for(IBodyElement element : bodyElement.getBodyElements()) {
          if(element instanceof XWPFParagraph) {
             XWPFParagraph paragraph = (XWPFParagraph)element;
             extractParagraph(paragraph, xhtml);
          }
          if(element instanceof XWPFTable) {
             XWPFTable table = (XWPFTable)element;
             extractTable(table, xhtml);
          }
      }
    }
    
    private void extractParagraph(XWPFParagraph paragraph, XHTMLContentHandler xhtml)
            throws SAXException, XmlException, IOException {
       // If this paragraph is actually a whole new section, then
       //  it could have its own headers and footers
       // Check and handle if so
       XWPFHeaderFooterPolicy headerFooterPolicy = null;
       if (paragraph.getCTP().getPPr() != null) {
           CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
           if(ctSectPr != null) {
              headerFooterPolicy =
                  new XWPFHeaderFooterPolicy(document, ctSectPr);
              extractHeaders(xhtml, headerFooterPolicy);
           }
       }
       
       // Is this a paragraph, or a heading?
       String tag = "p";
       String styleClass = null;
       if(paragraph.getStyleID() != null) {
          XWPFStyle style = styles.getStyle(
                paragraph.getStyleID()
          );
          
          TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
                style.getName(), paragraph.getPartType() == BodyType.TABLECELL
          );
          tag = tas.getTag();
          styleClass = tas.getStyleClass();
       }
       
       if(styleClass == null) {
          xhtml.startElement(tag);
       } else {
          xhtml.startElement(tag, "class", styleClass);
       }
       
       // Attach bookmarks for the paragraph
       // (In future, we might put them in the right place, for now
       //  we just put them in the correct paragraph)
       for (CTBookmark bookmark : paragraph.getCTP().getBookmarkStartList()) {
          xhtml.startElement("a", "name", bookmark.getName());
          xhtml.endElement("a");
       }
       
       // Do the text
       for(XWPFRun run : paragraph.getRuns()) {
          List<String> tags = new ArrayList<String>();
          if(run instanceof XWPFHyperlinkRun) {
             XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun)run;
             XWPFHyperlink link = linkRun.getHyperlink(document);
             if(link != null && link.getURL() != null) {
                xhtml.startElement("a", "href", link.getURL());
                tags.add("a");
             } else if(linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
                xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
                tags.add("a");
             }
          }
          if(run.isBold()) {
             xhtml.startElement("b");
             tags.add("b");
          }
          if(run.isItalic()) {
             xhtml.startElement("i");
             tags.add("i");
          }
          
          xhtml.characters(run.toString());
          
          for(int i=tags.size()-1; i>=0; i--) {
             xhtml.endElement(tags.get(i));
          }
          
          // If we have any pictures, output them
          for(XWPFPicture picture : run.getEmbeddedPictures()) {
             if(paragraph.getDocument() != null) {
                XWPFPictureData data = picture.getPictureData();
                if(data != null) {
                   xhtml.startElement("img", "src", "embedded:" + data.getFileName());
                   xhtml.endElement("img");
                }
             }
          }
       }
       
       // Now do any comments for the paragraph
       XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
       String commentText = comments.getCommentText();
       if(commentText != null && commentText.length() > 0) {
          xhtml.characters(commentText);
       }


       String footnameText = paragraph.getFootnoteText();
       if(footnameText != null && footnameText.length() > 0) {
          xhtml.characters(footnameText + "\n");
       }


       // Finish this paragraph
       xhtml.endElement(tag);


       if (headerFooterPolicy != null) {
           extractFooters(xhtml, headerFooterPolicy);
       }
    }


    private void extractTable(XWPFTable table, XHTMLContentHandler xhtml)
            throws SAXException, XmlException, IOException {
       xhtml.startElement("table");
       xhtml.startElement("tbody");
       for(XWPFTableRow row : table.getRows()) {
          xhtml.startElement("tr");
          for(XWPFTableCell cell : row.getTableCells()) {
             xhtml.startElement("td");
             extractIBodyText(cell, xhtml);
             xhtml.endElement("td");
          }
          xhtml.endElement("tr");
       }
       xhtml.endElement("tbody");
       xhtml.endElement("table");
    }
    
    private void extractFooters(
            XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
            throws SAXException {
        // footers
        if (hfPolicy.getFirstPageFooter() != null) {
            xhtml.element("p", hfPolicy.getFirstPageFooter().getText());
        }
        if (hfPolicy.getEvenPageFooter() != null) {
            xhtml.element("p", hfPolicy.getEvenPageFooter().getText());
        }
        if (hfPolicy.getDefaultFooter() != null) {
            xhtml.element("p", hfPolicy.getDefaultFooter().getText());
        }
    }


    private void extractHeaders(
            XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
            throws SAXException {
        if (hfPolicy.getFirstPageHeader() != null) {
            xhtml.element("p", hfPolicy.getFirstPageHeader().getText());
        }
        if (hfPolicy.getEvenPageHeader() != null) {
            xhtml.element("p", hfPolicy.getEvenPageHeader().getText());
        }
        if (hfPolicy.getDefaultHeader() != null) {
            xhtml.element("p", hfPolicy.getDefaultHeader().getText());
        }
    }


    /**
     * Word documents are simple, they only have the one
     *  main part
     */
    @Override
    protected List<PackagePart> getMainDocumentParts() {
       List<PackagePart> parts = new ArrayList<PackagePart>();
       parts.add( document.getPackagePart() );
       return parts;
    }
}
Source Code of org.apache.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator

Related Classes of org.apache.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator