Package org.apache.tika.language

Examples of org.apache.tika.language.LanguageIdentifier


   
    public void map(Text header, Text visibleText, OutputCollector<Text, Text> collector, Reporter reporter) throws IOException {
  
      try {
       
        String language = new LanguageIdentifier(visibleText.toString()).getLanguage();
        reporter.getCounter("FilterEnglish.language", language).increment(1);       
       
        if ("en".equals(language)) {
          collector.collect(header, visibleText);
        }
View Full Code Here


        if (this.contentMaxlength != -1
                && titleandcontent.length() > this.contentMaxlength)
            titleandcontent = titleandcontent.substring(0, contentMaxlength);

        LanguageIdentifier identifier = new LanguageIdentifier(titleandcontent);

        if (onlyCertain) {
            if (identifier.isReasonablyCertain())
                return identifier.getLanguage();
            else
                return null;
        }
        return identifier.getLanguage();
    }
View Full Code Here

 

  public void testLanguageIndentifier() {
    try {
      long total = 0;
      LanguageIdentifier identifier;
      BufferedReader in = new BufferedReader(new InputStreamReader(this
          .getClass().getResourceAsStream("test-referencial.txt")));
      String line = null;
      while ((line = in.readLine()) != null) {
        String[] tokens = line.split(";");
        if (!tokens[0].equals("")) {
          StringBuilder content = new StringBuilder();
          // Test each line of the file...
          BufferedReader testFile = new BufferedReader(new InputStreamReader(
              this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
          String testLine = null, lang = null;
          while ((testLine = testFile.readLine()) != null) {
            content.append(testLine + "\n");
            testLine = testLine.trim();
            if (testLine.length() > 256) {
              identifier = new LanguageIdentifier(testLine);
              lang = identifier.getLanguage();
              assertEquals(tokens[1], lang);
            }
          }
          testFile.close();

          // Test the whole file
          long start = System.currentTimeMillis();
          System.out.println(content.toString());
          identifier = new LanguageIdentifier(content.toString());
          lang = identifier.getLanguage();
          System.out.println(lang);
          total += System.currentTimeMillis() - start;
          assertEquals(tokens[1], lang);
        }
      }
View Full Code Here

  }

  public void testLanguageIndentifier() {
    try {
      long total = 0;
      LanguageIdentifier identifier;
      BufferedReader in = new BufferedReader(new InputStreamReader(this
          .getClass().getResourceAsStream("test-referencial.txt")));
      String line = null;
      while ((line = in.readLine()) != null) {
        String[] tokens = line.split(";");
        if (!tokens[0].equals("")) {
          StringBuilder content = new StringBuilder();
          // Test each line of the file...
          BufferedReader testFile = new BufferedReader(new InputStreamReader(
              this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
          String testLine = null, lang = null;
          while ((testLine = testFile.readLine()) != null) {
            content.append(testLine + "\n");
            testLine = testLine.trim();
            if (testLine.length() > 256) {
              identifier = new LanguageIdentifier(testLine);
              lang = identifier.getLanguage();
              assertEquals(tokens[1], lang);
            }
          }
          testFile.close();

          // Test the whole file
          long start = System.currentTimeMillis();
          System.out.println(content.toString());
          identifier = new LanguageIdentifier(content.toString());
          lang = identifier.getLanguage();
          System.out.println(lang);
          total += System.currentTimeMillis() - start;
          assertEquals(tokens[1], lang);
        }
      }
View Full Code Here

      String content = parse.getText();
      if (content != null) {
       text.append(" ").append(content.toString());
      }

      LanguageIdentifier identifier = new LanguageIdentifier(text.toString());

      if (onlyCertain) {
        if (identifier.isReasonablyCertain()) {
          return identifier.getLanguage();
        }
      } else {
        return identifier.getLanguage();
      }
    }
    return null;
  }
View Full Code Here

    if (text.length() < MIN_LENGTH_FOR_AUTO_DETECTION && fallbackLanguage != null) {
      print("Auto-detected language of text with length " + text.length() + " is not reasonably certain, using '" + fallbackLanguage + "' as fallback");
      return Language.getLanguageForShortName(fallbackLanguage);
    }
   
    final LanguageIdentifier identifier = new LanguageIdentifier(text);
    Language lang;
    try {
      lang = Language.getLanguageForShortName(identifier.getLanguage());
    } catch (IllegalArgumentException e) {
      // fall back to English
      lang = Language.getLanguageForLocale(Locale.ENGLISH);
    }
    if (lang.getDefaultLanguageVariant() != null) {
View Full Code Here

    check.getAndIncrement();
    checkExecutor.schedule(new RunnableImpl(caller), 0, TimeUnit.MILLISECONDS);
  }

  Language autoDetectLanguage(String text) {
    final LanguageIdentifier langIdentifier = new LanguageIdentifier(text);
    Language lang;
    try {
      lang = Language.getLanguageForShortName(langIdentifier.getLanguage());
    } catch (IllegalArgumentException e) {
      lang = Language.getLanguageForLocale(Locale.getDefault());
    }
    if (lang.hasVariant()) {
      // UI only shows variants like "English (American)", not just "English", so use that:
View Full Code Here

    final String text = StringTools.readStream(new FileInputStream(filename), encoding);
    return detectLanguageOfString(text);
  }

  private static Language detectLanguageOfString(final String text) {
    final LanguageIdentifier identifier = new LanguageIdentifier(text);
    final Language lang = Language.getLanguageForShortName(identifier.getLanguage());
    return lang;
  }
View Full Code Here

     *
     * @throws IOException if there is an error when reading the text
     */
    @Test
    public void testLangId() throws IOException {
        LanguageIdentifier tc = new LanguageIdentifier(text);
        String language = tc.getLanguage();
        assertEquals("en", language);
    }
View Full Code Here

        // truncate text to some piece from the middle if probeLength > 0
        int checkLength = probeLength;
        if (checkLength > 0 && text.length() > checkLength) {
            text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
        }
        LanguageIdentifier languageIdentifier = new LanguageIdentifier(text);
        String language = languageIdentifier.getLanguage();
        log.info("language identified as " + language);

        // add language to metadata
        MGraph g = ci.getMetadata();
        ci.getLock().writeLock().lock();
View Full Code Here

TOP

Related Classes of org.apache.tika.language.LanguageIdentifier

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.