Examples of LanguageIdentifier


Examples of org.apache.tika.language.LanguageIdentifier

      String content = parse.getText();
      if (content != null) {
       text.append(" ").append(content.toString());
      }

      LanguageIdentifier identifier = new LanguageIdentifier(text.toString());

      if (onlyCertain) {
        if (identifier.isReasonablyCertain()) {
          return identifier.getLanguage();
        }
      } else {
        return identifier.getLanguage();
      }
    }
    return null;
  }
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

    if (text.length() < MIN_LENGTH_FOR_AUTO_DETECTION && fallbackLanguage != null) {
      print("Auto-detected language of text with length " + text.length() + " is not reasonably certain, using '" + fallbackLanguage + "' as fallback");
      return Language.getLanguageForShortName(fallbackLanguage);
    }
   
    final LanguageIdentifier identifier = new LanguageIdentifier(text);
    Language lang;
    try {
      lang = Language.getLanguageForShortName(identifier.getLanguage());
    } catch (IllegalArgumentException e) {
      // fall back to English
      lang = Language.getLanguageForLocale(Locale.ENGLISH);
    }
    if (lang.getDefaultLanguageVariant() != null) {
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

    check.getAndIncrement();
    checkExecutor.schedule(new RunnableImpl(caller), 0, TimeUnit.MILLISECONDS);
  }

  Language autoDetectLanguage(String text) {
    final LanguageIdentifier langIdentifier = new LanguageIdentifier(text);
    Language lang;
    try {
      lang = Language.getLanguageForShortName(langIdentifier.getLanguage());
    } catch (IllegalArgumentException e) {
      lang = Language.getLanguageForLocale(Locale.getDefault());
    }
    if (lang.hasVariant()) {
      // UI only shows variants like "English (American)", not just "English", so use that:
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

    final String text = StringTools.readStream(new FileInputStream(filename), encoding);
    return detectLanguageOfString(text);
  }

  private static Language detectLanguageOfString(final String text) {
    final LanguageIdentifier identifier = new LanguageIdentifier(text);
    final Language lang = Language.getLanguageForShortName(identifier.getLanguage());
    return lang;
  }
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

     *
     * @throws IOException if there is an error when reading the text
     */
    @Test
    public void testLangId() throws IOException {
        LanguageIdentifier tc = new LanguageIdentifier(text);
        String language = tc.getLanguage();
        assertEquals("en", language);
    }
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

        // truncate text to some piece from the middle if probeLength > 0
        int checkLength = probeLength;
        if (checkLength > 0 && text.length() > checkLength) {
            text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
        }
        LanguageIdentifier languageIdentifier = new LanguageIdentifier(text);
        String language = languageIdentifier.getLanguage();
        log.info("language identified as " + language);

        // add language to metadata
        MGraph g = ci.getMetadata();
        ci.getLock().writeLock().lock();
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

     *
     * @throws IOException if there is an error when reading the text
     */
    @Test
    public void testLangId() throws IOException {
        LanguageIdentifier tc = new LanguageIdentifier(text);
        String language = tc.getLanguage();
        assertEquals("en", language);
    }
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

    /**
     * @return the identified language of the content
     */
    public String getLanguage() {
        return new LanguageIdentifier(content).getLanguage();
    }
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

  @Test
  public void testLanguageIndentifier() {
    try {
      long total = 0;
      LanguageIdentifier identifier;
      BufferedReader in = new BufferedReader(new InputStreamReader(this
          .getClass().getResourceAsStream("test-referencial.txt")));
      String line = null;
      while ((line = in.readLine()) != null) {
        String[] tokens = line.split(";");
        if (!tokens[0].equals("")) {
          StringBuilder content = new StringBuilder();
          // Test each line of the file...
          BufferedReader testFile = new BufferedReader(new InputStreamReader(
              this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
          String testLine = null, lang = null;
          while ((testLine = testFile.readLine()) != null) {
            content.append(testLine + "\n");
            testLine = testLine.trim();
            if (testLine.length() > 256) {
              identifier = new LanguageIdentifier(testLine);
              lang = identifier.getLanguage();
              assertEquals(tokens[1], lang);
            }
          }
          testFile.close();

          // Test the whole file
          long start = System.currentTimeMillis();
          System.out.println(content.toString());
          identifier = new LanguageIdentifier(content.toString());
          lang = identifier.getLanguage();
          System.out.println(lang);
          total += System.currentTimeMillis() - start;
          assertEquals(tokens[1], lang);
        }
      }
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

        InputStream in = this.getClass().getClassLoader().getResourceAsStream(
                testFileName);
        assertNotNull("failed to load resource " + testFileName, in);

        String text = IOUtils.toString(in);
        LanguageIdentifier tc = new LanguageIdentifier(text);
        String language = tc.getLanguage();
        assertEquals("en", language);
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact: coftware#gmail.com.