Examples of ASCIIFoldingFilter

See: http://en.wikipedia.org/wiki/Latin_characters_in_Unicode. The set of character conversions supported by this class is a superset of those supported by Lucene's ISOLatin1AccentFilter, which strips accents from Latin-1 characters; for example, 'à' will be replaced by 'a'. Characters from the following Unicode blocks are converted (a minimal usage sketch follows the list):
  • C1 Controls and Latin-1 Supplement: http://www.unicode.org/charts/PDF/U0080.pdf
  • Latin Extended-A: http://www.unicode.org/charts/PDF/U0100.pdf
  • Latin Extended-B: http://www.unicode.org/charts/PDF/U0180.pdf
  • Latin Extended Additional: http://www.unicode.org/charts/PDF/U1E00.pdf
  • Latin Extended-C: http://www.unicode.org/charts/PDF/U2C60.pdf
  • Latin Extended-D: http://www.unicode.org/charts/PDF/UA720.pdf
  • IPA Extensions: http://www.unicode.org/charts/PDF/U0250.pdf
  • Phonetic Extensions: http://www.unicode.org/charts/PDF/U1D00.pdf
  • Phonetic Extensions Supplement: http://www.unicode.org/charts/PDF/U1D80.pdf
  • General Punctuation: http://www.unicode.org/charts/PDF/U2000.pdf
  • Superscripts and Subscripts: http://www.unicode.org/charts/PDF/U2070.pdf
  • Enclosed Alphanumerics: http://www.unicode.org/charts/PDF/U2460.pdf
  • Dingbats: http://www.unicode.org/charts/PDF/U2700.pdf
  • Supplemental Punctuation: http://www.unicode.org/charts/PDF/U2E00.pdf
  • Alphabetic Presentation Forms: http://www.unicode.org/charts/PDF/UFB00.pdf
  • Halfwidth and Fullwidth Forms: http://www.unicode.org/charts/PDF/UFF00.pdf

    Examples of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter

        @Override
        protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
            // on exact - we don't care about suffixes at all, we always output original word with suffix only
            final HebrewTokenizer src = new HebrewTokenizer(reader, prefixesTree, SPECIAL_TOKENIZATION_CASES);
            TokenStream tok = new NiqqudFilter(src);
            tok = new ASCIIFoldingFilter(tok);
            tok = new LowerCaseFilter(matchVersion, tok);
            tok = new AddSuffixFilter(tok, '$') {
                @Override
                protected void handleCurrentToken() {
                    if (CommonGramsFilter.GRAM_TYPE.equals(typeAtt.type()) ||
    View Full Code Here

    Examples of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter

            // basically, if analyzerType == AnalyzerType.INDEXING)
            final StreamLemmasFilter src = new StreamLemmasFilter(reader, dictRadix, prefixesTree, SPECIAL_TOKENIZATION_CASES, commonWords, lemmaFilter);
            src.setCustomWords(customWords);
            src.setKeepOriginalWord(true);

            TokenStream tok = new ASCIIFoldingFilter(src);
            tok = new AddSuffixFilter(tok, '$') {
                @Override
                protected void handleCurrentToken() {
                    if (HebrewTokenizer.tokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew).equals(typeAtt.type())) {
                        if (keywordAtt.isKeyword())
    View Full Code Here

    Examples of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter

            final StreamLemmasFilter src = new StreamLemmasFilter(reader, dictRadix, prefixesTree, SPECIAL_TOKENIZATION_CASES, commonWords, lemmaFilter);
            src.setCustomWords(customWords);
            src.setKeepOriginalWord(false);
            src.setSuffixForExactMatch(originalTermSuffix);

            TokenStream tok = new ASCIIFoldingFilter(src);
            //tok = new SuffixKeywordFilter(tok, '$');
            tok = new AddSuffixFilter(tok, '$') {
                @Override
                protected void handleCurrentToken() {
                    if (HebrewTokenizer.tokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew).equals(typeAtt.type())) {
    View Full Code Here

    Examples of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter

    *
    */
    public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
      @Override
      public ASCIIFoldingFilter create(TokenStream input) {
        return new ASCIIFoldingFilter(input);
      }
    View Full Code Here

    Examples of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter

      public void testInvalidOffsets() throws Exception {
        Analyzer analyzer = new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new EdgeNGramTokenFilter(filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
            return new TokenStreamComponents(tokenizer, filters);
          }
        };
        assertAnalyzesTo(analyzer, "mosfellsbær",
    View Full Code Here

    Examples of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter

      public void testInvalidOffsets() throws Exception {
        Analyzer analyzer = new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new EdgeNGramTokenFilter(Version.LUCENE_4_3, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
            return new TokenStreamComponents(tokenizer, filters);
          }
        };
        assertAnalyzesTo(analyzer, "mosfellsbær",
    View Full Code Here

    Examples of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter

      public void testInvalidOffsets() throws Exception {
        Analyzer analyzer = new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2);
            return new TokenStreamComponents(tokenizer, filters);
          }
        };
        assertAnalyzesTo(analyzer, "mosfellsbær",
    View Full Code Here

    Examples of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new StandardTokenizer(LUCENE_VERSION, reader);
        TokenStream result = new StandardFilter(LUCENE_VERSION, tokenizer);
        result = new LowerCaseFilter(LUCENE_VERSION, result);
        result = new ASCIIFoldingFilter(result);
        result = new AlphaNumericMaxLengthFilter(result);
        result = new StopFilter(LUCENE_VERSION, result, STOP_SET);
        result = new PorterStemFilter(result);
        return new TokenStreamComponents(tokenizer, result);
      }
    View Full Code Here

    Examples of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter

      public void testInvalidOffset() throws Exception {
        Analyzer analyzer = new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new WordTokenFilter(filters);
            return new TokenStreamComponents(tokenizer, filters);
          }
        };
       
    View Full Code Here

    Examples of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter

      public void testInvalidOffsets() throws Exception {
        Analyzer analyzer = new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new EdgeNGramTokenFilter(Version.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
            return new TokenStreamComponents(tokenizer, filters);
          }
        };
        assertAnalyzesTo(analyzer, "mosfellsbær",
    View Full Code Here