Package ch.akuhn.hapax.corpus

Source Code of ch.akuhn.hapax.corpus.CorpusBuilderHelper

package ch.akuhn.hapax.corpus;

import static ch.akuhn.util.Get.each;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;

import ch.akuhn.hapax.util.Ziperator;
import ch.akuhn.hapax.util.Ziperator.Entry;
import ch.akuhn.util.Files;
import ch.akuhn.util.Throw;


public class CorpusBuilderHelper {

    private Corpus corpus;

    public CorpusBuilderHelper(Corpus corpus) {
        this.corpus = corpus;
    }

    public Corpus importAllFiles(File folder, String... extensions) {
        for (File each : Files.find(folder, extensions)) {
            corpus.putDocument(each.getAbsolutePath(), new Terms(each));
        }
        return corpus;
    }

    public Corpus importAllZipArchives(File folder, String... extensions) {
        for (File file : Files.find(folder, ".zip", ".jar")) {
            this.importZipArchive(file, extensions);
        }
        return corpus;
    }

    //    public Corpus importAllZipArchivesPackageWise(File folder, String... extensions) {
    //        for (File file : Files.find(folder, ".zip", ".jar")) {
    //            System.err.printf("importing file: %s\n", file.getName());
    //            this.importZipArchivePackageWise(file, extensions);
    //        }
    //        return corpus;
    //    }   

    //    public Corpus importZipArchivePackageWise(String path, String... extensions) {
    //        return this.importZipArchivePackageWise(new File(path), extensions);
    //    }

    //    public Corpus importZipArchivePackageWise(File file, String... extensions) {
    //        try {
    //            Map<String,Document> packages = new HashMap<String,Document>();
    //            ZipFile zip = new ZipFile(file);
    //            String version = file.getName();
    //           
    //            for (ZipEntry entry : each(zip.entries())) {
    //                String name = entry.getName();
    //                int endIndex = name.lastIndexOf('/'); // XXX don't use system file separator!
    //                if (endIndex < 0 || entry.isDirectory()) continue;
    //               
    //                for (String suffix : extensions) {
    //                    if (!name.endsWith(suffix)) continue;
    //                    InputStream in = zip.getInputStream(entry);
    //                    Terms terms = new Terms(in).intern();                   
    //                    String directory = name.substring(0, endIndex + 1);
    //                    if (!packages.containsKey(directory)) {
    //                        packages.put(directory, corpus.makeDocument(directory, version));
    //                    }
    //                    Document document = packages.get(directory);
    //                    document.addTerms(terms);
    //                    break;
    //                }
    //            }
    //            return corpus;
    //        } catch (ZipException ex) {
    //            throw Throw.exception(ex);
    //        } catch (IOException ex) {
    //            throw Throw.exception(ex);
    //        }
    //    }

    public Corpus importZipArchive(File file, String... extensions) {
        try {
            ZipFile zip = new ZipFile(file);
            for (ZipEntry entry : each(zip.entries())) {
                for (String suffix : extensions) {
                    if (!entry.getName().endsWith(suffix)) continue;
                    InputStream in = zip.getInputStream(entry);
                    Terms terms = new Terms(in).intern();
                    corpus.putDocument(entry.getName(), terms);
                    break;
                }
            }
            return corpus;
        } catch (ZipException ex) {
            throw Throw.exception(ex);
        } catch (IOException ex) {
            throw Throw.exception(ex);
        }
    }

    public void importZipArchive(String name, String extensions) {
        this.importZipArchive(new File(name), extensions);
    }

    public Corpus getCorpus() {
        return corpus;
    }

    public void importFrom(String source, String... extensions) {
        File file = new File(source);
        assert file.exists() : source;
        if (file.isDirectory()) importAllFiles(file, extensions);
        else smartImportZipArchive(file, extensions);
    }

    private void smartImportZipArchive(File file, String... extensions) {
        // first try to get files from nested sources
        Entry nestedSources = null;
        for (Entry each: new Ziperator(file)) {
            if (nestedSources == null && each.isSourceArchive()) nestedSources = each;
            if (each.parent != null && each.parent == nestedSources) {
                for (String ext: extensions) {
                    if (each.entry.getName().endsWith(ext)) {
                        corpus.putDocument(each.toString(), new Terms(each.in));
                        break;
                    }
                }
            }
        }
        if (nestedSources == null) importZipArchive(file, extensions);
    }

}
TOP

Related Classes of ch.akuhn.hapax.corpus.CorpusBuilderHelper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.