package ch.akuhn.hapax.corpus;
import static ch.akuhn.util.Get.each;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;
import ch.akuhn.hapax.util.Ziperator;
import ch.akuhn.hapax.util.Ziperator.Entry;
import ch.akuhn.util.Files;
import ch.akuhn.util.Throw;
/**
 * Fills a {@link Corpus} with documents read from plain files, folders and
 * zip/jar archives.
 *
 * <p>Every import method adds documents to the corpus passed to the
 * constructor via {@code corpus.putDocument(name, terms)}; the corpus is the
 * only state held by this helper.
 */
public class CorpusBuilderHelper {

    /** Corpus that receives every imported document. */
    private final Corpus corpus;

    /**
     * Creates a helper that imports into the given corpus.
     *
     * @param corpus the corpus to which imported documents are added
     */
    public CorpusBuilderHelper(Corpus corpus) {
        this.corpus = corpus;
    }

    /**
     * Adds one document per file yielded by {@code Files.find(folder, extensions)}.
     * The document name is the file's absolute path.
     *
     * @param folder     where to look for files
     * @param extensions passed through to {@code Files.find}
     *                   (presumably file-name suffixes such as ".java"; confirm against Files.find)
     * @return the corpus of this helper
     */
    public Corpus importAllFiles(File folder, String... extensions) {
        for (File each : Files.find(folder, extensions)) {
            corpus.putDocument(each.getAbsolutePath(), new Terms(each));
        }
        return corpus;
    }

    /**
     * Imports every archive yielded by {@code Files.find(folder, ".zip", ".jar")}
     * using {@link #importZipArchive(File, String...)}.
     *
     * @param folder     where to look for archives
     * @param extensions name suffixes an archive entry must end with to be imported
     * @return the corpus of this helper
     */
    public Corpus importAllZipArchives(File folder, String... extensions) {
        for (File file : Files.find(folder, ".zip", ".jar")) {
            this.importZipArchive(file, extensions);
        }
        return corpus;
    }

    // Commented-out package-wise import variant (not compiled), kept as found.
    // public Corpus importAllZipArchivesPackageWise(File folder, String... extensions) {
    // for (File file : Files.find(folder, ".zip", ".jar")) {
    // System.err.printf("importing file: %s\n", file.getName());
    // this.importZipArchivePackageWise(file, extensions);
    // }
    // return corpus;
    // }
    // public Corpus importZipArchivePackageWise(String path, String... extensions) {
    // return this.importZipArchivePackageWise(new File(path), extensions);
    // }
    // public Corpus importZipArchivePackageWise(File file, String... extensions) {
    // try {
    // Map<String,Document> packages = new HashMap<String,Document>();
    // ZipFile zip = new ZipFile(file);
    // String version = file.getName();
    //
    // for (ZipEntry entry : each(zip.entries())) {
    // String name = entry.getName();
    // int endIndex = name.lastIndexOf('/'); // XXX don't use system file separator!
    // if (endIndex < 0 || entry.isDirectory()) continue;
    //
    // for (String suffix : extensions) {
    // if (!name.endsWith(suffix)) continue;
    // InputStream in = zip.getInputStream(entry);
    // Terms terms = new Terms(in).intern();
    // String directory = name.substring(0, endIndex + 1);
    // if (!packages.containsKey(directory)) {
    // packages.put(directory, corpus.makeDocument(directory, version));
    // }
    // Document document = packages.get(directory);
    // document.addTerms(terms);
    // break;
    // }
    // }
    // return corpus;
    // } catch (ZipException ex) {
    // throw Throw.exception(ex);
    // } catch (IOException ex) {
    // throw Throw.exception(ex);
    // }
    // }

    /**
     * Adds one document per entry of the given archive whose name ends with
     * one of the given extensions. The document name is the entry name.
     *
     * <p>The archive is closed before this method returns, whether the import
     * succeeds or fails.
     *
     * @param file       the zip or jar archive to read
     * @param extensions name suffixes an entry must end with to be imported;
     *                   with no extensions nothing is imported
     * @return the corpus of this helper
     * @throws RuntimeException any {@link ZipException} or {@link IOException}
     *         is rethrown through {@code Throw.exception} (this method declares
     *         no checked exceptions)
     */
    public Corpus importZipArchive(File file, String... extensions) {
        try {
            ZipFile zip = new ZipFile(file);
            try {
                for (ZipEntry entry : each(zip.entries())) {
                    String name = entry.getName();
                    if (!hasAnySuffix(name, extensions)) continue;
                    InputStream in = zip.getInputStream(entry);
                    // NOTE(review): assumes Terms reads the stream completely inside
                    // its constructor, so the archive can be closed after the loop.
                    Terms terms = new Terms(in).intern();
                    corpus.putDocument(name, terms);
                }
            } finally {
                // Releases the file handle; closing a ZipFile also closes the
                // entry streams obtained from getInputStream.
                zip.close();
            }
            return corpus;
        } catch (ZipException ex) {
            throw Throw.exception(ex);
        } catch (IOException ex) {
            throw Throw.exception(ex);
        }
    }

    /**
     * Convenience overload of {@link #importZipArchive(File, String...)} that
     * takes the archive as a path name and exactly one extension.
     *
     * @param name       path of the zip or jar archive
     * @param extensions a single name suffix (not a varargs list, despite the plural name)
     */
    public void importZipArchive(String name, String extensions) {
        this.importZipArchive(new File(name), extensions);
    }

    /**
     * @return the corpus that this helper imports into
     */
    public Corpus getCorpus() {
        return corpus;
    }

    /**
     * Imports from a path that is either a folder or an archive: folders go
     * through {@link #importAllFiles(File, String...)}, anything else is
     * treated as a zip archive.
     *
     * @param source     path of a folder or of a zip/jar archive
     * @param extensions name suffixes of the files or entries to import
     */
    public void importFrom(String source, String... extensions) {
        File file = new File(source);
        // Only checked when the JVM runs with assertions enabled (-ea).
        assert file.exists() : source;
        if (file.isDirectory()) {
            importAllFiles(file, extensions);
        } else {
            smartImportZipArchive(file, extensions);
        }
    }

    /**
     * Imports an archive, preferring the contents of a nested source archive.
     *
     * <p>While walking the {@link Ziperator}, the first entry reporting
     * {@code isSourceArchive()} is remembered; entries whose parent is that
     * very entry and whose name ends with one of the extensions are added
     * under the name {@code each.toString()}. If no source archive is found,
     * the archive itself is imported with
     * {@link #importZipArchive(File, String...)}.
     */
    private void smartImportZipArchive(File file, String... extensions) {
        // first try to get files from nested sources
        Entry nestedSources = null;
        for (Entry each : new Ziperator(file)) {
            if (nestedSources == null && each.isSourceArchive()) nestedSources = each;
            // Identity comparison: only direct children of the remembered entry qualify.
            boolean insideNestedSources = each.parent != null && each.parent == nestedSources;
            if (insideNestedSources && hasAnySuffix(each.entry.getName(), extensions)) {
                corpus.putDocument(each.toString(), new Terms(each.in));
            }
        }
        if (nestedSources == null) importZipArchive(file, extensions);
    }

    /** Tells whether {@code name} ends with at least one of {@code suffixes}. */
    private static boolean hasAnySuffix(String name, String[] suffixes) {
        for (String suffix : suffixes) {
            if (name.endsWith(suffix)) return true;
        }
        return false;
    }
}