/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.rtf;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import javax.swing.text.*;
import javax.swing.text.rtf.RTFEditorKit;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * RTF parser.
 * <p>
 * The incoming stream is first rewritten into a temporary file in which the
 * hex-escaped bytes (backslash, apostrophe, two hex digits) of text that is
 * not in the ANSI marker charset are replaced by RTF Unicode escapes. That
 * temporary file is then loaded with Swing's {@link RTFEditorKit} and the
 * resulting document text is emitted as a single XHTML paragraph.
 */
public class RTFParser implements Parser {

    private static final Set<MediaType> SUPPORTED_TYPES = Collections
            .singleton(MediaType.application("rtf"));

    /** Font reference such as {@code \f1} or {@code \af1}; group 1 is the font number. */
    private static final Pattern F_PATTERN = Pattern.compile("\\\\a?f([0-9]+)");

    /** Font charset declaration, {@code \fcharsetN}. */
    private static final Pattern FCHARSET_PATTERN = Pattern
            .compile("\\\\fcharset[0-9]+");

    /** Document code page declaration, {@code \ansicpgN}. */
    private static final Pattern ANSICPG_PATTERN = Pattern
            .compile("\\\\ansicpg[0-9]+");

    /** Default font declaration, {@code \deffN}; group 1 is the font number. */
    private static final Pattern DEFAULT_FONT_PATTERN = Pattern.compile("\\\\deff([0-9]+)");

    /** Font family control words that appear inside font table entries. */
    private static final Pattern FONT_FAMILY_PATTERN = Pattern.compile("\\\\f(nil|roman|swiss|modern|script|decor|tech|bidi)");

    /**
     * Charset name used for ANSI text. Segments whose charset equals this
     * name are copied to the temporary file without rewriting their hex
     * escapes (see {@link #createUnicodeRtfTempFile(InputStream)}).
     * NOTE(review): ANSI is usually windows-1252; the value is kept as it was
     * because it also acts as the "no rewriting needed" marker - confirm
     * before changing.
     */
    private static final String ANSI_CHARSET = "windows-1251";

    /** Maps {@code \fcharsetN} numbers to Java charset names. */
    private static final Map<Integer, String> FONTSET_MAP;

    static {
        Map<Integer, String> map = new HashMap<Integer, String>();
        map.put(0, ANSI_CHARSET); // ANSI
        // charset 1 is Default
        // charset 2 is Symbol
        map.put(77, "MacRoman"); // Mac Roman
        map.put(78, "Shift_JIS"); // Mac Shift Jis
        map.put(79, "ms949"); // Mac Hangul
        map.put(80, "GB2312"); // Mac GB2312
        map.put(81, "Big5"); // Mac Big5
        map.put(82, "johab"); // Mac Johab (old)
        map.put(83, "MacHebrew"); // Mac Hebrew
        map.put(84, "MacArabic"); // Mac Arabic
        map.put(85, "MacGreek"); // Mac Greek
        map.put(86, "MacTurkish"); // Mac Turkish
        map.put(87, "MacThai"); // Mac Thai
        map.put(88, "cp1250"); // Mac East Europe
        map.put(89, "cp1251"); // Mac Russian
        map.put(128, "MS932"); // Shift JIS
        map.put(129, "ms949"); // Hangul
        map.put(130, "ms1361"); // Johab
        map.put(134, "ms936"); // GB2312
        map.put(136, "ms950"); // Big5
        map.put(161, "cp1253"); // Greek
        map.put(162, "cp1254"); // Turkish
        map.put(163, "cp1258"); // Vietnamese
        map.put(177, "cp1255"); // Hebrew
        map.put(178, "cp1256"); // Arabic
        // No mapping for 179 (Arabic Traditional), 180 (Arabic user)
        // and 181 (Hebrew user).
        map.put(186, "cp1257"); // Baltic
        map.put(204, "cp1251"); // Russian
        map.put(222, "ms874"); // Thai
        map.put(238, "cp1250"); // Eastern European
        map.put(254, "cp437"); // PC 437
        map.put(255, "cp850"); // OEM
        FONTSET_MAP = Collections.unmodifiableMap(map);
    }

    /**
     * Returns the media types handled by this parser.
     *
     * @param context parse context (not used)
     * @return a singleton set containing {@code application/rtf}
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses an RTF document and emits its text as one XHTML paragraph.
     *
     * @param stream   the RTF document; read to its end but not closed here
     * @param handler  receiver of the generated XHTML SAX events
     * @param metadata document metadata passed on to the XHTML handler
     * @param context  parse context (not used)
     * @throws IOException   if the stream or the temporary file cannot be
     *                       read or written
     * @throws SAXException  if the content handler fails
     * @throws TikaException if the text cannot be extracted from the
     *                       Swing document
     */
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {
        File tempFile = null;
        InputStream in = null;
        try {
            tempFile = createUnicodeRtfTempFile(stream);
            in = new FileInputStream(tempFile);

            Document sd = new CustomStyledDocument();
            new RTFEditorKit().read(in, sd, 0);

            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
                    metadata);
            xhtml.startDocument();
            xhtml.element("p", sd.getText(0, sd.getLength()));
            xhtml.endDocument();
        } catch (BadLocationException e) {
            throw new TikaException("Error parsing an RTF document", e);
        } finally {
            IOUtils.closeQuietly(in);
            if (tempFile != null) {
                tempFile.delete();
            }
        }
    }

    /**
     * Parses with a fresh, empty {@link ParseContext}.
     *
     * @deprecated This method will be removed in Apache Tika 1.0.
     */
    @Deprecated
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata) throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }

    /**
     * Replaces runs of hex escapes (backslash, apostrophe, two hex digits)
     * in an RTF fragment with RTF Unicode escapes, decoding each run of
     * bytes with the given encoding. Everything else is copied unchanged.
     * A hex escape that is cut off by the end of the fragment is copied
     * through verbatim instead of being read past the end of the string.
     *
     * @param data RTF fragment to rewrite
     * @param enc  Java charset name used to decode the escaped bytes; if it
     *             is not supported the original escapes are kept
     * @return the rewritten fragment
     */
    private String escapeByUnicode(String data, String enc) {
        int length = data.length();
        StringBuilder dataBuf = new StringBuilder(length + 16);
        // Characters of the control sequence currently being examined
        // (at most four: backslash, apostrophe and two hex digits).
        StringBuilder keywordBuf = new StringBuilder(4);
        // Original text of the escapes collected in baos, used as a fallback.
        StringBuilder origDataBuf = new StringBuilder();
        ByteArrayOutputStream baos = new ByteArrayOutputStream();

        for (int i = 0; i < length; i++) {
            char c1 = data.charAt(i);
            keywordBuf.append(c1);
            if (c1 == '\\' && i + 1 < length) {
                // Always consume the character after a backslash so that an
                // escaped backslash is never mistaken for the start of a
                // hex escape.
                char c2 = data.charAt(++i);
                keywordBuf.append(c2);
                if (c2 == '\'' && i + 1 < length) {
                    char c3 = data.charAt(++i);
                    keywordBuf.append(c3);
                    if (isHexDigit(c3) && i + 1 < length) {
                        char c4 = data.charAt(++i);
                        keywordBuf.append(c4);
                        if (isHexDigit(c4)) {
                            baos.write((Character.digit(c3, 16) << 4)
                                    | Character.digit(c4, 16));
                            origDataBuf.append(keywordBuf);
                            keywordBuf.setLength(0);
                            // Keep collecting: adjacent escapes may form one
                            // multi-byte character.
                            continue;
                        }
                    }
                }
            }
            // Anything other than a complete hex escape ends the current run.
            flushEscapedBytes(dataBuf, baos, origDataBuf, enc);
            dataBuf.append(keywordBuf);
            keywordBuf.setLength(0);
        }
        flushEscapedBytes(dataBuf, baos, origDataBuf, enc);
        return dataBuf.toString();
    }

    /**
     * Decodes the collected bytes with the given encoding and appends them
     * to the output as Unicode escapes; if the encoding is unsupported the
     * original escape text is appended instead. Both buffers are emptied.
     */
    private void flushEscapedBytes(StringBuilder dataBuf,
            ByteArrayOutputStream baos, StringBuilder origDataBuf, String enc) {
        if (baos.size() == 0) {
            return;
        }
        try {
            appendUnicodeStr(dataBuf, new String(baos.toByteArray(), enc));
        } catch (UnsupportedEncodingException e) {
            // Unknown charset name: keep the escapes exactly as they were.
            dataBuf.append(origDataBuf);
        }
        origDataBuf.setLength(0);
        baos.reset();
    }

    /** Tells whether the character is an ASCII hexadecimal digit. */
    private static boolean isHexDigit(char c) {
        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')
                || (c >= 'A' && c <= 'F');
    }

    /**
     * Appends decoded text to an RTF buffer. Characters with a code from 20
     * up to and including 79 are appended as they are; every other character
     * is written as a group holding the RTF Unicode control word and the
     * decimal character code.
     * NOTE(review): the bounds are decimal 20 and 80; hexadecimal 0x20 and
     * 0x80 may have been intended. The current range also escapes backslash
     * and braces, so confirm the consequences before changing it.
     *
     * @param dataBuf buffer to append to
     * @param value   decoded text
     */
    private void appendUnicodeStr(StringBuilder dataBuf, String value) {
        for (int j = 0; j < value.length(); j++) {
            char ch = value.charAt(j);
            if (ch >= 20 && ch < 80) {
                dataBuf.append(ch);
            } else {
                dataBuf.append("{\\u");
                dataBuf.append((int) ch);
                dataBuf.append('}');
            }
        }
    }

    /**
     * Copies the RTF stream into a temporary file, rewriting hex escapes of
     * segments that are not in the ANSI marker charset into Unicode escapes.
     * <p>
     * The input is cut into segments at unescaped braces and at spaces (a
     * space does not end a segment that contains a font family control
     * word). For each segment the effective charset is taken, in order, from
     * a charset declaration in the segment, the font table entry of the
     * segment's font (or of the previous or default font), the charset of
     * the enclosing group, and finally the document default.
     *
     * @param in RTF input; read until its end, not closed
     * @return the temporary file; the caller is expected to delete it
     * @throws IOException if reading the input or writing the file fails;
     *                     the temporary file is removed in that case
     */
    private File createUnicodeRtfTempFile(InputStream in) throws IOException {
        boolean success = false;
        File tempFile = null;
        BufferedOutputStream out = null;
        try {
            tempFile = File.createTempFile("temp", ".rtf");
            out = new BufferedOutputStream(new FileOutputStream(tempFile));

            String defaultCharset = ANSI_CHARSET; // ansi
            String defaultFont = "0";
            // font number -> charset name, filled from the font table
            Map<String, String> fontTableMap = new HashMap<String, String>();
            StringBuilder dataBuf = new StringBuilder(255);
            // one charset per open group level
            LinkedList<String> charsetQueue = new LinkedList<String>();
            int depth = 0;
            String prevFt = null;
            // the previous delimiter (brace or space), not the previous byte
            int prevCh = -1;

            int ch;
            while ((ch = in.read()) != -1) {
                boolean groupDelimiter = (ch == '{' || ch == '}')
                        && !endsWithOddBackslashes(dataBuf);
                boolean spaceDelimiter = ch == ' '
                        && !FONT_FAMILY_PATTERN.matcher(dataBuf).find();

                if (!groupDelimiter && !spaceDelimiter) {
                    dataBuf.append((char) ch);
                    continue;
                }

                if (charsetQueue.size() > depth + 1) {
                    charsetQueue.removeLast();
                }

                String data = dataBuf.toString();
                // Put an escaped space (Unicode control word for code 32) in
                // front of table cell ends.
                data = data.replace("\\cell", "\\u0020\\cell");

                if (data.indexOf("\\colortbl") != -1) {
                    // End of font table, clear last/previous font encountered.
                    prevFt = null;
                }

                if (depth == 1) {
                    // check control words for a default charset
                    String cset = loadAnsiCpg(data);
                    if (cset != null) {
                        defaultCharset = cset;
                    }
                    Matcher matcher = DEFAULT_FONT_PATTERN.matcher(data);
                    if (matcher.find()) {
                        defaultFont = matcher.group(1);
                    }
                }

                String ft = loadFontTable(data);
                String charset = loadCharset(data);
                if (ft != null && charset != null) {
                    fontTableMap.put(ft, charset);
                }

                if (ft == null && prevCh == ' ') {
                    // text continuing after a space keeps the previous font
                    ft = prevFt;
                } else if (ft != null) {
                    prevFt = ft;
                }
                if (ft == null) {
                    ft = defaultFont;
                }

                // set a current charset
                if (charset == null) {
                    charset = fontTableMap.get(ft);
                }
                if (charset == null && charsetQueue.size() > 0) {
                    charset = charsetQueue.getLast();
                }
                if (charset == null) {
                    charset = defaultCharset;
                }

                // add the current charset to a queue
                if (charsetQueue.size() < depth + 1) {
                    charsetQueue.add(charset);
                }

                String escapedStr = ANSI_CHARSET.equals(charset) ? data
                        : escapeByUnicode(data, charset);
                out.write(escapedStr.getBytes("UTF-8"));
                out.write(ch);
                dataBuf.setLength(0);
                prevCh = ch;

                // update a depth
                if (ch == '{') {
                    depth++;
                } else if (ch == '}') {
                    depth--;
                }
            }
            out.flush();
            success = true;
        } finally {
            IOUtils.closeQuietly(out);
            // Remove the half-written file on any failure, including
            // unchecked exceptions.
            if (!success && tempFile != null) {
                tempFile.delete();
            }
        }
        return tempFile;
    }

    /**
     * Tells whether the buffer ends with an odd number of backslashes, which
     * means that the character following it is escaped (an even number is a
     * sequence of escaped backslashes).
     */
    private static boolean endsWithOddBackslashes(CharSequence buf) {
        int count = 0;
        for (int i = buf.length() - 1; i >= 0 && buf.charAt(i) == '\\'; i--) {
            count++;
        }
        return (count & 1) == 1;
    }

    /**
     * Finds the last font reference in the fragment.
     *
     * @param line RTF fragment
     * @return the font number as text, or {@code null} if there is none
     */
    private String loadFontTable(String line) {
        Matcher m = F_PATTERN.matcher(line);
        String font = null;
        while (m.find()) {
            font = m.group(1);
        }
        return font;
    }

    /**
     * Looks for a code page declaration in the fragment and returns the
     * charset registered under that number.
     * NOTE(review): the number is looked up in the table that is keyed by
     * font charset numbers (0 to 255), so code page numbers such as 1252
     * find no entry - confirm whether a code page mapping was intended.
     *
     * @param line RTF fragment
     * @return the charset name, or {@code null} if there is no declaration,
     *         the number is not parseable or it has no mapping
     */
    private String loadAnsiCpg(String line) {
        Matcher m = ANSICPG_PATTERN.matcher(line);
        String charset = null;
        if (m.find()) {
            try {
                // 8 is the length of the control word before the digits
                int encVal = Integer.parseInt(m.group().substring(8));
                charset = FONTSET_MAP.get(encVal);
            } catch (NumberFormatException ignored) {
                // digits do not fit an int: treat as no declaration
            }
        }
        return charset;
    }

    /**
     * Looks for a font charset declaration in the fragment and returns the
     * charset registered under that number. A number that cannot be parsed
     * is treated as 0 (ANSI).
     *
     * @param line RTF fragment
     * @return the charset name, or {@code null} if there is no declaration
     *         or the number has no mapping
     */
    private String loadCharset(String line) {
        Matcher m = FCHARSET_PATTERN.matcher(line);
        String charset = null;
        if (m.find()) {
            int encVal;
            try {
                // 9 is the length of the control word before the digits
                encVal = Integer.parseInt(m.group().substring(9));
            } catch (NumberFormatException e) {
                encVal = 0;
            }
            charset = FONTSET_MAP.get(encVal);
        }
        return charset;
    }

    /**
     * Customized version of {@link DefaultStyledDocument}. Adds whitespace
     * to places where words otherwise could have run together (see
     * <a href="https://issues.apache.org/jira/browse/TIKA-392">TIKA-392</a>),
     * and works around the problem of Swing expecting a GUI environment (see
     * <a href="https://issues.apache.org/jira/browse/TIKA-282">TIKA-282</a>).
     */
    private static class CustomStyledDocument extends DefaultStyledDocument {

        /** Whether the previous insertion was a single character above 127. */
        private boolean isPrevUnicode = false;

        public CustomStyledDocument() {
            super(new NoReclaimStyleContext());
        }

        /**
         * Inserts the text, putting a space in front of it when it is
         * appended to a non-empty document and neither this insertion nor
         * the previous one is a single character above 127.
         */
        @Override
        public void insertString(int offs, String str, AttributeSet a)
                throws BadLocationException {
            boolean isUnicode = str.length() == 1 && str.charAt(0) > 127;

            if (offs > 0 && offs == getLength() && !isPrevUnicode && !isUnicode) {
                super.insertString(offs, " ", a);
                super.insertString(getLength(), str, a);
            } else {
                super.insertString(offs, str, a);
            }

            isPrevUnicode = isUnicode;
        }
    }

    /**
     * A workaround to
     * <a href="https://issues.apache.org/jira/browse/TIKA-282">TIKA-282</a>:
     * RTF parser expects a GUI environment. This class simply disables the
     * troublesome SwingUtilities.isEventDispatchThread() call that's made in
     * the {@link StyleContext#reclaim(AttributeSet)} method.
     */
    private static class NoReclaimStyleContext extends StyleContext {

        /** Ignored. */
        @Override
        public void reclaim(AttributeSet a) {
        }
    }
}