Package com.googlecode.gaal.data.impl

Examples of com.googlecode.gaal.data.impl.TreeMapCorpus


        // Open dstFileName for reading (srcReader is created before this excerpt begins).
        FileReader dstReader = new FileReader(dstFileName);
        // One tokenizer per reader; both use STRING_REGEX and a LowerCaseNormalizer.
        Tokenizer<String> srcTokenizer = new MultidocumentRegexTokenizer(srcReader, STRING_REGEX,
                new LowerCaseNormalizer());
        Tokenizer<String> dstTokenizer = new MultidocumentRegexTokenizer(dstReader, STRING_REGEX,
                new LowerCaseNormalizer());
        // Build a TreeMapCorpus from each tokenizer, passing the shared SEPARATORS.
        srcCorpus = new TreeMapCorpus(srcTokenizer, SEPARATORS);
        dstCorpus = new TreeMapCorpus(dstTokenizer, SEPARATORS);
        // IntervalSetBuilder intervalSetBuilder = new SupermaximalSetBuilder();
        // Keep each corpus's sequence, then build a LinearizedSuffixTreeImpl from that
        // sequence together with the corpus's alphabet size.
        srcSequence = srcCorpus.sequence();
        dstSequence = dstCorpus.sequence();
        srcLST = new LinearizedSuffixTreeImpl(srcSequence, srcCorpus.alphabetSize());
        dstLST = new LinearizedSuffixTreeImpl(dstSequence, dstCorpus.alphabetSize());
View Full Code Here


        // Open dstFileName for reading (srcReader is created before this excerpt begins).
        FileReader dstReader = new FileReader(dstFileName);
        // One tokenizer per reader; each is given STRING_REGEX and a StopWordRemover that is
        // constructed with that side's stop-word collection and a LowerCaseNormalizer.
        Tokenizer<String> srcTokenizer = new MultidocumentRegexTokenizer(srcReader, STRING_REGEX, new StopWordRemover(
                srcStopWords, new LowerCaseNormalizer()));
        Tokenizer<String> dstTokenizer = new MultidocumentRegexTokenizer(dstReader, STRING_REGEX, new StopWordRemover(
                dstStopWords, new LowerCaseNormalizer()));
        // Build a TreeMapCorpus per side, each with its own separators.
        Corpus<String> srcCorpus = new TreeMapCorpus(srcTokenizer, srcSeparators);
        Corpus<String> dstCorpus = new TreeMapCorpus(dstTokenizer, dstSeparators);

        // Numeric settings; the code that consumes them lies beyond this excerpt, so their
        // exact meaning cannot be confirmed from here.
        double minSimilarity = 0.3;
        int alignmentsNumber = 3;
        int minVectorSize = 5;
        int windowSize = 9;
View Full Code Here

                System.err.printf("can't open source file: ", e.getMessage());
                System.exit(1);
            }
            Tokenizer<String> srcTokenizer = new MultidocumentRegexTokenizer(srcReader, regex, new StopWordRemover(
                    srcStopWords, new LowerCaseNormalizer()));
            Corpus<String> srcCorpus = new TreeMapCorpus(srcTokenizer, srcSeparators, corpusSize);
            Corpus<String> dstCorpus = null;
            if (dstFileName != null) {
                try {
                    dstReader = new FileReader(dstFileName);
                } catch (FileNotFoundException e) {
                    System.err.printf("can't open target file: ", e.getMessage());
                    System.exit(1);
                }
                Tokenizer<String> dstTokenizer = new MultidocumentRegexTokenizer(dstReader, regex, new StopWordRemover(
                        dstStopWords, new LowerCaseNormalizer()));
                dstCorpus = new TreeMapCorpus(dstTokenizer, dstSeparators, corpusSize);
            }
            ANALYSER = new ConcurrentAnalyser(srcCorpus, dstCorpus);
        }
        return ANALYSER;
    }
View Full Code Here

        // "n", "r", "s", "t", "y", "#" };
        // StringReader sr = new StringReader("caggtcagtcacggtatca#");
        // String[] alphabet = { null, "a", "c", "g", "t", "#" };

        // The regex [\W\w] matches any single character; presumably RegexTokenizer therefore
        // emits one token per character of sr (confirm against RegexTokenizer).
        Tokenizer<String> tokenizer = new RegexTokenizer(sr, "[\\W\\w]", new LowerCaseNormalizer());
        Corpus<String> corpus = new TreeMapCorpus(tokenizer, alphabet);
        // Dump the corpus's alphabet size, sequence size, sequence and alphabet to stdout.
        System.out.println("alphabet size " + corpus.alphabetSize());
        System.out.println("text size " + corpus.sequence().size());
        System.out.println("text " + corpus.sequence());
        System.out.println("alphabet " + corpus.alphabet());
        // Print every token produced by iterating over the sequence, on a single line.
        Iterator<String> iter = corpus.iterator(corpus.sequence());
        while (iter.hasNext())
            System.out.print(iter.next());
        System.out.println();

        // Same again, but iterating over the reversed sequence.
        Iterator<String> revIter = corpus.iterator(corpus.sequence().reverse());
        while (revIter.hasNext())
            System.out.print(revIter.next());
        System.out.println();

        // LinearizedSuffixTree sa = new KimLinearizedSuffixTree(
View Full Code Here

TOP

Related Classes of com.googlecode.gaal.data.impl.TreeMapCorpus

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.