Package cc.twittertools.corpus.index

Source Code of cc.twittertools.corpus.index.TokenizationTest

package cc.twittertools.corpus.index;

import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.io.StringReader;

import junit.framework.JUnit4TestAdapter;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.junit.Test;

import cc.twittertools.search.indexing.TweetAnalyzer;

public class TokenizationTest {

  @Test
  public void basic() throws Exception {
    Analyzer analyzer = new TweetAnalyzer(Version.LUCENE_41);

    String test1 = parseKeywords(
        analyzer,
        "AT&T getting secret immunity from wiretapping laws for government surveillance http://vrge.co/ZP3Fx5");
    assertEquals(
        "|att|get|secret|immun|from|wiretap|law|for|govern|surveil|http://vrge.co/ZP3Fx5|", test1);

    String test2 = parseKeywords(analyzer,
    // See comment in removeNonAlphanumeric() of
    // LowerCaseEntityPreservingFilter: isAlphabetic
    // will correctly scrub the trailing unicode "...", but is a JDK7 method.
        "want to see the @verge aston martin GT4 racer tear up long beach? http://theracersgroup.kinja.com/watch-an-aston-martin-vantage-gt4-tear-around-long-beac-479726219 …");
    assertEquals(
        "|want|to|see|the|@verge|aston|martin|gt4|racer|tear|up|long|beach|http://theracersgroup.kinja.com/watch-an-aston-martin-vantage-gt4-tear-around-long-beac-479726219|",
        test2);

    String test3 = parseKeywords(
        analyzer,
        "Incredibly good news! #Drupal users rally http://bit.ly/Z8ZoFe  to ensure blind accessibility contributor gets to @DrupalCon #Opensource");
    assertEquals(
        "|incred|good|new|#drupal|user|ralli|http://bit.ly/Z8ZoFe|to|ensur|blind|access|contributor|get|to|@drupalcon|#opensource|",
        test3);

    // Issue with this test case... comment out for now.
    String test4 = parseKeywords(
        analyzer,
        "We're entering the quiet hours at #amznhack. #Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz");
    assertEquals(
        "|were|enter|the|quiet|hour|at|#amznhack|#rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz|",
        test4);

    String test5 = parseKeywords(
        analyzer,
        "The 2013 Social Event Detection Task (SED) at #mediaeval2013, http://bit.ly/16nITsf  supported by @linkedtv @project_mmixer @socialsensor_ip");
    assertEquals(
        "|the|2013|social|event|detect|task|sed|at|#mediaeval2013|http://bit.ly/16nITsf|support|by|@linkedtv|@project_mmixer|@socialsensor_ip|",
        test5);

    String test6 = parseKeywords(analyzer,
        "U.S.A. U.K. U.K USA UK #US #UK #U.S.A #U.K ...A.B.C...D..E..F..A.LONG WORD");
    assertEquals("|usa|uk|uk|usa|uk|#us|#uk|#u|sa|#u|k|abc|d|e|f|a|long|word|", test6);

    String test7 = parseKeywords(analyzer, "this is @a_valid_mention and this_is_multiple_words");
    assertEquals("|thi|is|@a_valid_mention|and|thi|is|multipl|word|", test7);

    String test8 = parseKeywords(analyzer,
        "PLEASE BE LOWER CASE WHEN YOU COME OUT THE OTHER SIDE - ALSO A @VALID_VALID-INVALID");
    assertEquals(
        "|pleas|be|lower|case|when|you|come|out|the|other|side|also|a|@valid_valid|invalid|", test8);

    // Note: the at sign is not the normal (at) sign and the crazy hashtag is
    // not the normal #
    String test9 = parseKeywords(analyzer, "@reply @with #crazy ~#at");
    assertEquals("|@reply|@with|#crazy|#at|", test9);

    String test10 = parseKeywords(
        analyzer,
        ":@valid testing(valid)#hashtags. RT:@meniton (the last @mention is #valid and so is this:@valid), however this is@invalid");
    assertEquals(
        "|@valid|test|valid|#hashtags|rt|@meniton|the|last|@mention|is|#valid|and|so|is|thi|@valid|howev|thi|is|invalid|",
        test10);

    String test11 = parseKeywords(
        analyzer,
        "this][is[lots[(of)words+with-lots=of-strange!characters?$in-fact=it&has&Every&Single:one;of<them>in_here_B&N_test_test?test\\test^testing`testing{testing}testing…testing¬testing·testing what?");
    assertEquals(
        "|thi|is|lot|of|word|with|lot|of|strang|charact|in|fact|it|ha|everi|singl|on|of|them|in|here|bn|test|test|test|test|test|test|test|test|test|test|test|what|",
        test11);

  }

  public static junit.framework.Test suite() {
    return new JUnit4TestAdapter(TokenizationTest.class);
  }

  public String parseKeywords(Analyzer analyzer, String keywords) {
    StringBuilder sb = new StringBuilder("|");
    try {
      TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords));
      CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        if (cattr.toString().length() == 0)
          continue;
        sb.append(cattr.toString());
        sb.append("|");
      }
      tokenStream.end();
      tokenStream.close();

    } catch (IOException e) {
      throw new RuntimeException(e);
    }

    return sb.toString();
  }

}
TOP

Related Classes of cc.twittertools.corpus.index.TokenizationTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.