Package com.googlecode.gaal.vis

Source Code of com.googlecode.gaal.vis.VectorVisualizer

package com.googlecode.gaal.vis;

import java.io.FileNotFoundException;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import com.googlecode.gaal.analysis.api.Filter;
import com.googlecode.gaal.analysis.api.VectorBuilder;
import com.googlecode.gaal.analysis.impl.IntervalVectorBuilder;
import com.googlecode.gaal.analysis.impl.ProperIntervalSetBuilder;
import com.googlecode.gaal.analysis.impl.RecursiveIntervalExtractor;
import com.googlecode.gaal.analysis.impl.SimpleContextFilter;
import com.googlecode.gaal.data.api.Corpus;
import com.googlecode.gaal.data.api.IntSequence;
import com.googlecode.gaal.data.api.Vector;
import com.googlecode.gaal.data.impl.ArraySequence;
import com.googlecode.gaal.data.impl.SparseVector;
import com.googlecode.gaal.data.impl.TreeMapCorpus;
import com.googlecode.gaal.preprocess.api.Tokenizer;
import com.googlecode.gaal.preprocess.impl.LowerCaseNormalizer;
import com.googlecode.gaal.preprocess.impl.MultidocumentRegexTokenizer;
import com.googlecode.gaal.suffix.api.EmbeddedSuffixTree.EmbeddedInterval;
import com.googlecode.gaal.suffix.api.IntervalTree.Interval;
import com.googlecode.gaal.suffix.api.LinearizedSuffixTree;
import com.googlecode.gaal.suffix.impl.LinearizedSuffixTreeImpl;
import com.googlecode.gaal.vis.api.VectorDrawing;
import com.googlecode.gaal.vis.impl.TikzConstants;

public class VectorVisualizer {

    /**
     * Regular expression handed to both tokenizers in the constructor. It matches either a word made of letters
     * and digits (including the characters ü, ä, ö, ß), optionally followed by an apostrophe and a lower-case
     * letter/digit suffix, or a single punctuation mark out of {@code . , ; ( )}.
     */
    public static final String STRING_REGEX = "([A-Z0-9a-züäöß]+'?[a-z0-9]*)|[\\.,;\\(\\)]";

    /** Text inserted between tokens when documents and interval labels are rendered as strings. */
    private static final String DELIMITER = " ";

    private static final Set<String> SEPARATORS;
    static {
        SEPARATORS = new HashSet<String>();
        SEPARATORS.add(".");
        SEPARATORS.add(",");
    }

    /**
     * Reference vector against which {@code drawAngles} measures every selected vector.
     * NOTE(review): presumably a unit vector along the first dimension (value 1 at index 0) - confirm against
     * {@code SparseVector.add}.
     */
    private static final Vector X_AXIS = new SparseVector();
    static {
        // The vector starts out empty; give it its single component.
        X_AXIS.add(0, 1);
    }

    /** Corpus built from the file named by the constructor's {@code srcFileName}. */
    protected final Corpus<String> srcCorpus;
    /** Corpus built from the file named by the constructor's {@code dstFileName}. */
    protected final Corpus<String> dstCorpus;
    /** Suffix tree built over {@link #srcSequence}. */
    protected LinearizedSuffixTree srcLST;
    /** Suffix tree built over {@link #dstSequence}. */
    protected LinearizedSuffixTree dstLST;
    /** Symbol sequence obtained from {@link #srcCorpus}. */
    protected final ArraySequence srcSequence;
    /** Symbol sequence obtained from {@link #dstCorpus}. */
    protected final ArraySequence dstSequence;

    /** Vectors built for the source corpus; filled together with {@link #srcObjects} in the constructor. */
    protected List<Vector> srcVectors;
    /** Vectors built for the destination corpus; filled together with {@link #dstObjects} in the constructor. */
    protected List<Vector> dstVectors;
    /** Intervals of the source corpus; the code reads entry {@code i} as the origin of {@code srcVectors.get(i)}. */
    protected List<Interval> srcObjects;
    /** Intervals of the destination corpus; the code reads entry {@code i} as the origin of {@code dstVectors.get(i)}. */
    protected List<Interval> dstObjects;

    public VectorVisualizer(String srcFileName, String dstFileName, int windowSize) throws FileNotFoundException {
        FileReader srcReader = new FileReader(srcFileName);
        FileReader dstReader = new FileReader(dstFileName);
        Tokenizer<String> srcTokenizer = new MultidocumentRegexTokenizer(srcReader, STRING_REGEX,
                new LowerCaseNormalizer());
        Tokenizer<String> dstTokenizer = new MultidocumentRegexTokenizer(dstReader, STRING_REGEX,
                new LowerCaseNormalizer());
        srcCorpus = new TreeMapCorpus(srcTokenizer, SEPARATORS);
        dstCorpus = new TreeMapCorpus(dstTokenizer, SEPARATORS);
        // IntervalSetBuilder intervalSetBuilder = new SupermaximalSetBuilder();
        srcSequence = srcCorpus.sequence();
        dstSequence = dstCorpus.sequence();
        srcLST = new LinearizedSuffixTreeImpl(srcSequence, srcCorpus.alphabetSize());
        dstLST = new LinearizedSuffixTreeImpl(dstSequence, dstCorpus.alphabetSize());
        srcVectors = new ArrayList<Vector>();
        dstVectors = new ArrayList<Vector>();
        srcObjects = new ArrayList<Interval>();
        dstObjects = new ArrayList<Interval>();
        VectorBuilder<Interval> vectorBuilder = new IntervalVectorBuilder<Interval>();
        Filter<EmbeddedInterval> contextFilter = new SimpleContextFilter(0, 0);
        Iterator<Interval> srcIntervalIterator = new RecursiveIntervalExtractor(srcLST, srcCorpus,
                new ProperIntervalSetBuilder(), contextFilter, windowSize).iterator();
        Iterator<Interval> dstIntervalIterator = new RecursiveIntervalExtractor(dstLST, dstCorpus,
                new ProperIntervalSetBuilder(), contextFilter, windowSize).iterator();
        vectorBuilder.buildVectors(srcIntervalIterator, srcVectors, srcObjects, srcCorpus, -1);
        vectorBuilder.buildVectors(dstIntervalIterator, dstVectors, dstObjects, dstCorpus, -1);
    }

    public void visualizeVectors(VectorDrawing drawing) {
        List<String> labels = Arrays.asList(new String[] { "both$\\dots$and", "sowohl$\\dots$als auch" });
        Map<Integer, Vector> srcSelectedVectors = getVectors(srcCorpus, srcVectors, srcObjects, labels);
        Map<Integer, Vector> dstSelectedVectors = getVectors(dstCorpus, dstVectors, dstObjects, labels);

        drawVectors(drawing, srcSelectedVectors, labels);
        drawVectors(drawing, dstSelectedVectors, labels);

        drawAngles(drawing, new ArrayList<Vector>(srcSelectedVectors.values()), new ArrayList<Vector>(
                dstSelectedVectors.values()));

        Map<Integer, Integer> srcRepeatsMap = mapRepeats(srcCorpus, srcObjects, labels);
        Map<Integer, Integer> dstRepeatsMap = mapRepeats(dstCorpus, dstObjects, labels);

        List<String> srcDocuments = toDocumentList(srcCorpus, srcRepeatsMap);
        List<String> dstDocuments = toDocumentList(dstCorpus, dstRepeatsMap);
        drawDocuments(drawing, srcDocuments, dstDocuments);
    }

    public void visualizeRepeats(VectorDrawing drawing, List<String> labels) {
        Map<Integer, Vector> srcSelectedVectors = getVectors(dstCorpus, dstVectors, dstObjects, labels);

        drawVectors(drawing, srcSelectedVectors, labels);

        Map<Integer, Integer> srcRepeatsMap = mapRepeats(dstCorpus, dstObjects, labels);

        List<String> srcDocuments = toDocumentList(dstCorpus, srcRepeatsMap);
        drawDocuments(drawing, srcDocuments);
    }

    public void visualizeCorpus(VectorDrawing drawing) {
        drawing.drawDocument(toDocument(srcCorpus), false);
    }

    public void visualizeDocuments(VectorDrawing drawing) {

        @SuppressWarnings("unchecked")
        List<String> srcDocuments = toDocumentList(srcCorpus, Collections.EMPTY_MAP);
        drawDocuments(drawing, srcDocuments);
    }

    public void visualizeParallelCorpus(VectorDrawing drawing) {
        drawing.drawDocument(toDocument(srcCorpus), toDocument(dstCorpus), false);
    }

    public void visualizeParallelDocuments(VectorDrawing drawing) {

        @SuppressWarnings("unchecked")
        List<String> srcDocuments = toDocumentList(srcCorpus, Collections.EMPTY_MAP);
        @SuppressWarnings("unchecked")
        List<String> dstDocuments = toDocumentList(dstCorpus, Collections.EMPTY_MAP);
        drawDocuments(drawing, srcDocuments, dstDocuments);
    }

    public void visualizeTermDocumentMatrix(VectorDrawing drawing) {
        List<Map<String, Integer>> termMapList = toTermMapList(srcCorpus);
        List<String> documentLabels = new ArrayList<String>(termMapList.size());
        for (int i = 0; i < termMapList.size(); i++) {
            documentLabels.add(String.format("$d_%d$", i));
        }
        Set<String> termSet = new TreeSet<String>();
        for (Map<String, Integer> termMap : termMapList) {
            termSet.addAll(termMap.keySet());
        }
        List<String> termLabels = new ArrayList<String>(termSet);
        int[][] values = new int[termLabels.size()][documentLabels.size()];
        int docNumber = 0;
        for (Map<String, Integer> termMap : termMapList) {
            for (Map.Entry<String, Integer> entry : termMap.entrySet()) {
                int i = termLabels.indexOf(entry.getKey());
                values[i][docNumber] = entry.getValue();
            }
            docNumber++;
        }
        drawing.drawMatrix(termLabels, documentLabels, values);
    }

    public void visualizeWordContextMatrix(VectorDrawing drawing) {
        List<Map<String, Integer>> termMapList = toTermMapList(srcCorpus);
        List<String> documentLabels = new ArrayList<String>(termMapList.size());
        for (int i = 0; i < termMapList.size(); i++) {
            documentLabels.add(String.format("$d_%d$", i));
        }
        Set<String> termSet = new TreeSet<String>();
        for (Map<String, Integer> termMap : termMapList) {
            termSet.addAll(termMap.keySet());
        }
        List<String> termLabels = new ArrayList<String>(termSet);
        int[][] values = new int[documentLabels.size()][termLabels.size()];
        int docNumber = 0;
        for (Map<String, Integer> termMap : termMapList) {
            for (Map.Entry<String, Integer> entry : termMap.entrySet()) {
                int i = termLabels.indexOf(entry.getKey());
                values[docNumber][i] = entry.getValue();
            }
            docNumber++;
        }
        drawing.drawMatrix(documentLabels, termLabels, values);
    }

    public static String toString(Interval interval, Corpus<String> corpus) {
        if (interval instanceof EmbeddedInterval) {
            EmbeddedInterval embeddedInterval = (EmbeddedInterval) interval;
            return String.format("%s$\\dots$%s", toString(embeddedInterval.getEmbeddingInterval(), corpus),
                    corpus.toString(embeddedInterval.label(), DELIMITER));
        }
        return corpus.toString(interval.label(), DELIMITER);
    }

    public static void mapInterval(Interval interval, IntSequence indices, Map<Integer, Integer> repeatsMap, int number) {
        if (indices == null) {
            indices = interval.indices();
        }
        int lcp = interval.lcp();
        for (int i = 0; i < indices.size(); i++) {
            for (int j = 0; j < lcp; j++) {
                repeatsMap.put(indices.get(i) + j, number);
            }
        }
        if (interval instanceof EmbeddedInterval) {
            EmbeddedInterval embeddedInterval = (EmbeddedInterval) interval;
            mapInterval(embeddedInterval.getEmbeddingInterval(), embeddedInterval.embeddingIndices(), repeatsMap,
                    number);
        }
    }

    /**
     * Filters the indices of an embedded interval so that, among entries sharing the same embedding index, only
     * one survives. Whenever two not-yet-removed entries have equal embedding indices, the one with the smaller
     * value in {@code indices} is dropped (on a tie the later-found entry {@code j} is dropped). Progress is
     * traced on standard output.
     * <p>
     * The code reads {@code indices} and {@code embeddingIndices} at the same positions, i.e. it assumes both
     * sequences have the same length.
     *
     * @param interval the embedded interval whose indices are filtered
     * @return {@code interval.indices()} itself if nothing was removed, otherwise a new sequence holding the
     *         surviving indices in their original order
     */
    public static IntSequence removeIntervening(EmbeddedInterval interval) {
        IntSequence indices = interval.indices();
        IntSequence embeddingIndices = interval.embeddingIndices();
        System.out.format("ind:%s\n", indices);
        System.out.format("emb:%s\n", embeddingIndices);
        // Bit i set means entry i has been discarded.
        BitSet removedIndices = new BitSet(embeddingIndices.size());
        for (int i = 0; i < embeddingIndices.size(); i++) {
            if (!removedIndices.get(i)) {
                int index = indices.get(i);
                int embeddingIndex = embeddingIndices.get(i);
                // Look for the first other surviving entry with the same embedding index.
                for (int j = 0; j < embeddingIndices.size(); j++) {
                    if (i != j && !removedIndices.get(j) && embeddingIndices.get(j) == embeddingIndex) {
                        System.out.format("%d=%d\n", embeddingIndices.get(j), embeddingIndex);
                        // Keep whichever entry has the larger index value.
                        if (indices.get(j) > index) {
                            System.out.format("removing i=%d\n", i);
                            removedIndices.set(i);
                        } else {
                            System.out.format("removing j=%d\n", j);
                            removedIndices.set(j);
                        }
                        // Only one conflict is resolved per outer iteration; remaining duplicates are handled
                        // when the outer loop reaches them.
                        break;
                    }
                }
            }
        }
        if (removedIndices.cardinality() == 0) {
            return indices;
        } else {
            // Copy the survivors, preserving their order.
            int[] newIndices = new int[indices.size() - removedIndices.cardinality()];
            int counter = 0;
            for (int i = 0; i < indices.size(); i++) {
                if (!removedIndices.get(i)) {
                    newIndices[counter++] = indices.get(i);
                }
            }
            return new ArraySequence(newIndices);
        }
    }

    private static void drawVectors(VectorDrawing drawing, Map<Integer, Vector> selectedVectors, List<String> labels) {
        for (Map.Entry<Integer, Vector> entry : selectedVectors.entrySet()) {
            Vector vector = entry.getValue();
            int index = entry.getKey();
            drawing.drawVector(vector.get(0), vector.get(1), labels.get(index), TikzConstants.VECTOR_STYLES[index]);
        }
    }

    private static void drawAngles(VectorDrawing drawing, List<Vector> srcVectors, List<Vector> dstVectors) {
        for (int i = 0; i < srcVectors.size(); i++) {
            Vector srcVector = srcVectors.get(i);
            Vector dstVector = dstVectors.get(i);
            double srcToXAngle = Math.toDegrees(Math.acos(X_AXIS.similarity(srcVector)));
            double dstToXAngle = Math.toDegrees(Math.acos(X_AXIS.similarity(dstVector)));
            drawing.drawAngle(Math.min(srcToXAngle, dstToXAngle), Math.max(srcToXAngle, dstToXAngle), "$\\theta$");
        }
    }

    private static void drawDocuments(VectorDrawing drawing, List<String> srcDocuments, List<String> dstDocuments) {
        for (int i = 0; i < srcDocuments.size(); i++) {
            drawing.drawDocument(srcDocuments.get(i), dstDocuments.get(i), true);
        }
    }

    private static void drawDocuments(VectorDrawing drawing, List<String> documents) {
        for (int i = 0; i < documents.size(); i++) {
            drawing.drawDocument(documents.get(i), true);
        }
    }

    private static Map<Integer, Integer> mapRepeats(Corpus<String> corpus, List<Interval> intervals, List<String> labels) {
        Map<Integer, Integer> repeatsMap = new HashMap<Integer, Integer>();
        for (int i = 0; i < intervals.size(); i++) {
            Interval interval = intervals.get(i);
            String label = toString(interval, corpus);
            int index = labels.indexOf(label);
            if (index != -1) {
                mapInterval(interval, null, repeatsMap, index);
            }
        }
        return repeatsMap;
    }

    private static List<String> toDocumentList(Corpus<String> corpus, Map<Integer, Integer> repeatsMap) {
        IntSequence sequence = corpus.sequence();
        List<String> documents = new ArrayList<String>();
        StringBuilder sb = new StringBuilder();
        int prevRepeatNumber = -1;
        int docNumber = 0;
        boolean isFirst = true;
        boolean isBlockStart = false;
        for (int i = 0; i < sequence.size(); i++) {
            Integer repeatNumber = repeatsMap.get(i);
            if (prevRepeatNumber != -1 && (repeatNumber == null || prevRepeatNumber != repeatNumber)) {
                sb.append("}");
            }
            if (repeatNumber != null && prevRepeatNumber != repeatNumber) {
                sb.append(String.format(" {\\color{%s}", TikzConstants.VECTOR_STYLES[repeatNumber].getColour()));
                isBlockStart = true;
            }
            if (repeatNumber != null) {
                prevRepeatNumber = repeatNumber;
            } else {
                prevRepeatNumber = -1;
            }
            int docId = corpus.getDocumentId(i);
            if (docNumber != docId) {
                docNumber = docId;
                documents.add(sb.toString());
                sb = new StringBuilder();
            }
            int symbol = sequence.get(i);
            String token = corpus.toToken(symbol);
            if (!corpus.isSeparator(symbol)) {
                if (isFirst) {
                    isFirst = false;
                } else if (!token.equals(",") && !token.equals(".") && !isBlockStart) {
                    sb.append(DELIMITER);
                }
                sb.append(token);
            }
            if (isBlockStart)
                isBlockStart = false;
        }
        documents.add(sb.toString());
        return documents;
    }

    private static String toDocument(Corpus<String> corpus) {
        IntSequence sequence = corpus.sequence();
        StringBuilder sb = new StringBuilder();
        boolean isFirst = true;
        for (int i = 0; i < sequence.size(); i++) {
            int symbol = sequence.get(i);
            String token = corpus.toToken(symbol);
            if (!corpus.isSeparator(symbol)) {
                if (isFirst) {
                    isFirst = false;
                } else if (!token.equals(",") && !token.equals(".")) {
                    sb.append(DELIMITER);
                }
                sb.append(token);
            }
        }
        return sb.toString();
    }

    private static List<Map<String, Integer>> toTermMapList(Corpus<String> corpus) {
        List<Map<String, Integer>> documentList = new ArrayList<Map<String, Integer>>();
        IntSequence sequence = corpus.sequence();
        Map<String, Integer> termMap = null;
        int docNumber = -1;
        boolean isFirst = true;
        for (int i = 0; i < sequence.size(); i++) {
            int docId = corpus.getDocumentId(i);
            if (docNumber != docId) {
                docNumber = docId;
                if (termMap != null)
                    documentList.add(termMap);
                termMap = new TreeMap<String, Integer>();
            }
            int symbol = sequence.get(i);
            String token = corpus.toToken(symbol);
            if (!corpus.isSeparator(symbol)) {
                if (isFirst) {
                    isFirst = false;
                } else if (!token.equals(",") && !token.equals(".")) {
                    Integer count = termMap.get(token);
                    termMap.put(token, (count == null ? 1 : count + 1));
                }
            }
        }
        documentList.add(termMap);
        return documentList;
    }

    private static Map<Integer, Vector> getVectors(Corpus<String> corpus, List<Vector> vectors,
            List<Interval> intervals, List<String> labels) {
        Map<Integer, Vector> vectorMap = new TreeMap<Integer, Vector>();
        for (int i = 0; i < vectors.size(); i++) {
            Vector vector = vectors.get(i);
            String label = toString(intervals.get(i), corpus);
            int index = labels.indexOf(label);
            System.out.printf("testing:%s\n", label);
            if (index != -1) {
                System.out.printf("match:%s\n", label);
                vectorMap.put(index, vector);
            }
        }
        return vectorMap;
    }
}
TOP

Related Classes of com.googlecode.gaal.vis.VectorVisualizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle. Contact: coftware#gmail.com.