/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package ivory.core.tokenize;

import ivory.core.data.dictionary.DefaultCachedFrequencySortedDictionary;
import ivory.core.data.document.TermDocVector;

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;

import org.apache.log4j.Logger;

import com.google.common.collect.Maps;

import edu.umd.cloud9.collection.Indexable;
import edu.umd.cloud9.util.array.ArrayListOfInts;

/**
 * Utility methods for converting documents into term-position mappings during indexing.
 *
 * @author Tamer Elsayed
 * @author Jimmy Lin
 */
public class DocumentProcessingUtils {
  private static final Logger LOG = Logger.getLogger(DocumentProcessingUtils.class);

  // Term frequencies are stored as shorts, so this is the cutoff beyond which a tf overflows.
  public static final short TF_CUT = Short.MAX_VALUE;
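
  /**
   * Converts a {@link TermDocVector} into a sorted map from each term's integer id (as assigned
   * by the supplied dictionary) to the array of positions at which the term occurs. Terms not
   * found in the dictionary are skipped.
   *
   * <p>A minimal usage sketch (variable names are illustrative):
   *
   * <pre>
   * SortedMap&lt;Integer, int[]&gt; termPositions =
   *     DocumentProcessingUtils.integerizeTermDocVector(docVector, dictionary);
   * </pre>
   *
   * @param doc term document vector to integerize
   * @param termIDMap dictionary mapping terms to integer ids
   * @return sorted map from term id to positions of that term in the document
   */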
public static SortedMap<Integer, int[]> integerizeTermDocVector(TermDocVector doc,
DefaultCachedFrequencySortedDictionary termIDMap) {
SortedMap<Integer, int[]> positions = Maps.newTreeMap();
    TermDocVector.Reader reader;
    try {
      reader = doc.getReader();
    } catch (IOException e) {
      // Chain the cause so the original stack trace isn't lost.
      throw new RuntimeException("Error getting TermDocVector reader!", e);
}
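
    // Map each term to its integer id; a non-positive id means the term isn't in the
    // dictionary, so we skip it.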
while (reader.hasMoreTerms()) {
int termid = termIDMap.getId(reader.nextTerm());
if (termid <= 0) {
continue;
}
positions.put(termid, reader.getPositions());
}
return positions;
}
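
  /**
   * Tokenizes the content of an {@link Indexable} document and builds a map from each term to
   * the list of its (one-based) positions in the document. The document length is stored under
   * the special empty-string key, and terms whose tf would overflow a short are dropped.
   *
   * <p>A minimal usage sketch (the document and tokenizer are assumed to be constructed
   * elsewhere):
   *
   * <pre>
   * Map&lt;String, ArrayListOfInts&gt; positions =
   *     DocumentProcessingUtils.parseDocument(doc, tokenizer);
   * int doclength = positions.get("").get(0);
   * </pre>
   *
   * @param doc document to parse
   * @param tokenizer tokenizer used to extract terms
   * @return map from term to positions, plus the document length under the empty-string key
   */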
public static Map<String, ArrayListOfInts> parseDocument(Indexable doc, Tokenizer tokenizer) {
Map<String, ArrayListOfInts> positions = Maps.newHashMap();
String text = doc.getContent();
String[] terms = tokenizer.processContent(text);
    // The tokenizer may return zero-length (empty) terms, and a term's tf may exceed the
    // capacity of a short: empty and overly long terms are dropped here, while tf overflow is
    // handled separately below.
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
// Guard against bad tokenization
if (term.length() == 0 || term.length() >= Byte.MAX_VALUE) {
continue;
}
      // Remember: token positions are numbered starting from one...
      ArrayListOfInts list = positions.get(term);
      if (list == null) {
        list = new ArrayListOfInts();
        positions.put(term, list);
      }
      list.add(i + 1);
}
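
    // Second pass: compute the document length and drop terms whose tf would overflow a short.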
int doclength = 0;
    Iterator<Map.Entry<String, ArrayListOfInts>> it = positions.entrySet().iterator();
    while (it.hasNext()) {
      Map.Entry<String, ArrayListOfInts> e = it.next();
      ArrayListOfInts positionsList = e.getValue();
// We're storing tfs as shorts, so check for overflow...
if (positionsList.size() >= TF_CUT) {
        // There are a few ways to handle this... a tf this high most likely means the document
        // is junk, so we simply drop the term.
        LOG.warn("Error: tf of " + positionsList.size()
            + " will overflow max short value. docid=" + doc.getDocid() + ", term="
            + e.getKey());
        it.remove();
} else {
positionsList.trimToSize();
doclength += positionsList.size();
}
}
    if (positions.isEmpty()) {
return positions;
}
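
    // Store the document length under the special empty-string key; this can never collide
    // with a real term because empty terms were discarded above.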
positions.put("", new ArrayListOfInts(new int[] { doclength }));
return positions;
}
}