// Package de.jetwick.util

// Source code of de.jetwick.util.Statistics

/**
* Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*         http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.util;

import com.google.api.translate.Language;

import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import com.google.inject.Module;
import de.jetwick.config.DefaultModule;
import de.jetwick.data.JTag;
import de.jetwick.es.ElasticTweetSearch;
import de.jetwick.data.JTweet;
import de.jetwick.data.JUser;
import de.jetwick.es.ElasticTagSearch;
import de.jetwick.es.ElasticUserSearch;
import de.jetwick.es.TweetQuery;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.elasticsearch.action.admin.indices.optimize.OptimizeResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Execute eg. via
* ./myjava  de.jetwick.util.Statistics exportNoiseWords=solr/conf/stopwords.txt
*
* @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
*/
public class Statistics {

    private static Logger logger = LoggerFactory.getLogger(Statistics.class);

    public static void main(String[] args) throws Exception {
        Map<String, String> map = Helper.parseArguments(args);
        logger.info("arguments:" + map);
        if (args.length == 0)
            map.put("print", "timetabling");

        Module module = new DefaultModule();
        Injector injector = Guice.createInjector(module);
        injector.getInstance(Statistics.class).start(map);
    }
    @Inject
    private ElasticTweetSearch tweetSearch;
    @Inject
    private ElasticUserSearch userSearch;
    @Inject
    private ElasticTagSearch tagSearch; 

    public Statistics() {       
    }

    public void start(Map<String, String> map) throws Exception {
        String argStr = map.get("optimize");
        if (argStr != null) {
            int segments = 1;
            logger.info("Start optimizing for twindex");
            OptimizeResponse rsp = tweetSearch.optimize(tweetSearch.getIndexName(), segments);
            logger.info("Optimized twindex to " + segments + " segments for " + rsp.getSuccessfulShards() + "/" + rsp.getTotalShards() + " shards.\n Now uindex");
            rsp = tweetSearch.optimize(tweetSearch.getIndexName(), segments);
            logger.info("Optimized uindex  to " + segments + " segments for " + rsp.getSuccessfulShards() + "/" + rsp.getTotalShards() + " shards.");
        }

        argStr = map.get("listTweets");
        if (argStr != null) {
            if ("true".equals(argStr))
                argStr = "**:*";

            List<JUser> list = new ArrayList<JUser>();
            long ret = tweetSearch.query(list, new TweetQuery(argStr, false)).
                    getHits().getTotalHits();
            logger.info("Found: " + ret + " users. Returned: " + list.size());
            print(list);
            return;
        }

        // specify file via exportNoiseWords=stopwords.txt
        argStr = map.get("exportNoiseWords");
        if (argStr != null) {
            write(new TreeSet<String>(JTweet.NOISE_WORDS.keySet()), argStr);
            return;
        }

        argStr = map.get("importTags");
        if (argStr != null)
            importTags(map.get("tagFile"));

        argStr = map.get("clearPropertiesOfTags");
        if (argStr != null)
            clearPropertiesOfTags();

        argStr = map.get("readStopAndClear");
        if (argStr != null)
            readStopwords(JTweet.class.getResourceAsStream("noise_words_pt.txt"));//noise_words_fr.txt, lang_det_sp.txt

        argStr = map.get("translate");
        if (argStr != null)
            translate(Language.PORTUGUESE);
    }

    public void print(List list) {
        for (Object o : list) {
            System.out.println(o);
        }
    }

    public void importTags(String file) throws IOException {
        Set<String> newTags = new TreeSet<String>();
        for (String str : Helper.readFile(file)) {
            if (str.trim().length() > 1)
                newTags.add(JTag.toLowerCaseOnlyOnTerms(str.trim()));
        } // do only delete those where we don't have a new one
        // do only store tags which are new

        boolean ignoreSearchError = false;
        try {
            for (JTag tag : tagSearch.findAll(0, 1000)) {
                if (!newTags.contains(tag.getTerm()))
                    tagSearch.deleteByName(tag.getTerm());
                else
                    newTags.remove(tag.getTerm());
            }
        } catch (Exception ex) {
            ignoreSearchError = true;
            logger.info("Tag index seems to be not available or empty! Message:" + ex.getMessage());
        }

        tagSearch.addAll(newTags, true, ignoreSearchError);
        tagSearch.optimize();
        logger.info("Imported tag:" + newTags.size() + " all tags:" + tagSearch.findAll(0, 1000).size());
    }

    public void clearPropertiesOfTags() throws IOException {
        Set<JTag> newTags = new LinkedHashSet<JTag>();
        int counter = 0;
        for (JTag tag : tagSearch.findAll(0, 1000)) {
            counter++;
            newTags.add(tag.clearProperties());
        }
        tagSearch.bulkUpdate(newTags, tagSearch.getIndexName(), true);
        tagSearch.optimize();
        logger.info(counter + " Updated:" + newTags.size() + " tags " + newTags);
    }

    public void write(Set<String> words, String file) throws Exception {
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), Helper.UTF8));
        writer.write("# Written from YTweet via Statistics class! " + new Date());
        for (String str : words) {
            writer.write(str);
            writer.write("\n");
        }
        writer.close();
    }

    public void readStopwords(InputStream is) throws Exception {
        List<String> list = Helper.readFile(Helper.createBuffReader(is));
        Set<String> set = new TreeSet<String>();
        for (String str : list) {
            if (str.isEmpty() || str.startsWith("//"))
                continue;

            str = str.toLowerCase();
            if (str.contains(" "))
                for (String tmp : str.split(" ")) {
                    set.add(tmp.trim());
                }
            else
                set.add(str.trim());
        }

        for (String str : set) {
            System.out.println(str);
        }
    }

    public void translate(Language lang) throws Exception {
        List<String> list = Helper.readFile(Helper.createBuffReader(JTweet.class.getResourceAsStream("lang_det_en.txt")));
        Set<String> res = new TreeSet<String>();
        Set<String> cache = new LinkedHashSet<String>();
        int charCounter = 0;
        for (String str : list) {
            if (str.isEmpty() || str.startsWith("//"))
                continue;

            str = str.toLowerCase().trim();
            charCounter += str.length();
            cache.add(str);
            if (charCounter > 1500) {
                try {
                    String gTranslated = Translate.execute(cache.toString(), Language.ENGLISH, lang);
                    for (String tmp : gTranslated.split(",")) {
                        tmp = tmp.toLowerCase().trim().replaceAll("\\[", "").replaceAll("\\]", "");
                        res.add(tmp);
                    }
//                    System.out.println(tmp);
                } catch (Exception ex) {
                    logger.error("Cannot translate " + cache.size() + " lines", ex);
                }

                charCounter = 0;
                cache.clear();
            }
        }

        System.out.println("=======================\n\n");

        for (String str : res) {
            System.out.println(str);
        }
    }
}
// TOP

// Related classes of de.jetwick.util.Statistics

// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact: coftware#gmail.com.