Examples of LanguageIdentifier


Examples of org.apache.tika.language.LanguageIdentifier

    if (text.length() < MIN_LENGTH_FOR_AUTO_DETECTION && fallbackLanguage != null) {
      print("Auto-detected language of text with length " + text.length() + " is not reasonably certain, using '" + fallbackLanguage + "' as fallback");
      return Language.getLanguageForShortName(fallbackLanguage);
    }
   
    final LanguageIdentifier identifier = new LanguageIdentifier(text);
    Language lang;
    try {
      lang = Language.getLanguageForShortName(identifier.getLanguage());
    } catch (IllegalArgumentException e) {
      // fall back to English
      lang = Language.getLanguageForLocale(Locale.ENGLISH);
    }
    if (lang.getDefaultVariant() != null) {
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

      httpServer = null;
    }
  }

  private Language autoDetectLanguage(String text) {
    final LanguageIdentifier langIdentifier = new LanguageIdentifier(text);
    Language lang;
    try {
      lang = Language.getLanguageForShortName(langIdentifier.getLanguage());
    } catch (IllegalArgumentException e) {
      lang = Language.getLanguageForLocale(Locale.getDefault());
    }
    if (lang.hasVariant()) {
      // UI only shows variants like "English (American)", not just "English", so use that:
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

  @Test
  public void testLanguageIndentifier() {
    try {
      long total = 0;
      LanguageIdentifier identifier;
      BufferedReader in = new BufferedReader(new InputStreamReader(this
          .getClass().getResourceAsStream("test-referencial.txt")));
      String line = null;
      while ((line = in.readLine()) != null) {
        String[] tokens = line.split(";");
        if (!tokens[0].equals("")) {
          StringBuilder content = new StringBuilder();
          // Test each line of the file...
          BufferedReader testFile = new BufferedReader(new InputStreamReader(
              this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
          String testLine = null, lang = null;
          while ((testLine = testFile.readLine()) != null) {
            content.append(testLine + "\n");
            testLine = testLine.trim();
            if (testLine.length() > 256) {
              identifier = new LanguageIdentifier(testLine);
              lang = identifier.getLanguage();
              assertEquals(tokens[1], lang);
            }
          }
          testFile.close();

          // Test the whole file
          long start = System.currentTimeMillis();
          System.out.println(content.toString());
          identifier = new LanguageIdentifier(content.toString());
          lang = identifier.getLanguage();
          System.out.println(lang);
          total += System.currentTimeMillis() - start;
          assertEquals(tokens[1], lang);
        }
      }
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

      String content = parse.getText();
      if (content != null) {
       text.append(" ").append(content.toString());
      }

      LanguageIdentifier identifier = new LanguageIdentifier(text.toString());

      if (onlyCertain) {
        if (identifier.isReasonablyCertain()) {
          return identifier.getLanguage();
        }
      } else {
        return identifier.getLanguage();
      }
    }
    return null;
  }
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

  }

  private void extractLanguage(JCas plainTextView) {
    try {
      LanguageIdentifier li = new LanguageIdentifier(new LanguageProfile(plainTextView.getDocumentText()));
      if (li.getLanguage() != null && !"".equals(li.getLanguage()))
        plainTextView.setDocumentLanguage(li.getLanguage());
    }
    catch (Exception e) {
      this.getContext().getLogger().log(Level.WARNING, new StringBuffer("Could not extract language due to ")
              .append(e.getLocalizedMessage()).toString());
    }
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

        if (langDetect) {
            try {
                if (language != null) {
                    metadata.add(Metadata.CONTENT_LANGUAGE, language);
                } else {
                    LanguageIdentifier identifier = new LanguageIdentifier(parsedContent);
                    language = identifier.getLanguage();
                }
                context = context.createExternalValueContext(language);
                languageMapper.parse(context);
            } catch(Throwable t) {
                logger.debug("Cannot detect language: [{}]", t.getMessage());
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

   
    public void map(Text header, Text visibleText, OutputCollector<Text, Text> collector, Reporter reporter) throws IOException {
  
      try {
       
        String language = new LanguageIdentifier(visibleText.toString()).getLanguage();
        reporter.getCounter("FilterEnglish.language", language).increment(1);       
       
        if ("en".equals(language)) {
          collector.collect(header, visibleText);
        }
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

        if (this.contentMaxlength != -1
                && titleandcontent.length() > this.contentMaxlength)
            titleandcontent = titleandcontent.substring(0, contentMaxlength);

        LanguageIdentifier identifier = new LanguageIdentifier(titleandcontent);

        if (onlyCertain) {
            if (identifier.isReasonablyCertain())
                return identifier.getLanguage();
            else
                return null;
        }
        return identifier.getLanguage();
    }
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

 

  public void testLanguageIndentifier() {
    try {
      long total = 0;
      LanguageIdentifier identifier;
      BufferedReader in = new BufferedReader(new InputStreamReader(this
          .getClass().getResourceAsStream("test-referencial.txt")));
      String line = null;
      while ((line = in.readLine()) != null) {
        String[] tokens = line.split(";");
        if (!tokens[0].equals("")) {
          StringBuilder content = new StringBuilder();
          // Test each line of the file...
          BufferedReader testFile = new BufferedReader(new InputStreamReader(
              this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
          String testLine = null, lang = null;
          while ((testLine = testFile.readLine()) != null) {
            content.append(testLine + "\n");
            testLine = testLine.trim();
            if (testLine.length() > 256) {
              identifier = new LanguageIdentifier(testLine);
              lang = identifier.getLanguage();
              assertEquals(tokens[1], lang);
            }
          }
          testFile.close();

          // Test the whole file
          long start = System.currentTimeMillis();
          System.out.println(content.toString());
          identifier = new LanguageIdentifier(content.toString());
          lang = identifier.getLanguage();
          System.out.println(lang);
          total += System.currentTimeMillis() - start;
          assertEquals(tokens[1], lang);
        }
      }
View Full Code Here

Examples of org.apache.tika.language.LanguageIdentifier

  }

  public void testLanguageIndentifier() {
    try {
      long total = 0;
      LanguageIdentifier identifier;
      BufferedReader in = new BufferedReader(new InputStreamReader(this
          .getClass().getResourceAsStream("test-referencial.txt")));
      String line = null;
      while ((line = in.readLine()) != null) {
        String[] tokens = line.split(";");
        if (!tokens[0].equals("")) {
          StringBuilder content = new StringBuilder();
          // Test each line of the file...
          BufferedReader testFile = new BufferedReader(new InputStreamReader(
              this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
          String testLine = null, lang = null;
          while ((testLine = testFile.readLine()) != null) {
            content.append(testLine + "\n");
            testLine = testLine.trim();
            if (testLine.length() > 256) {
              identifier = new LanguageIdentifier(testLine);
              lang = identifier.getLanguage();
              assertEquals(tokens[1], lang);
            }
          }
          testFile.close();

          // Test the whole file
          long start = System.currentTimeMillis();
          System.out.println(content.toString());
          identifier = new LanguageIdentifier(content.toString());
          lang = identifier.getLanguage();
          System.out.println(lang);
          total += System.currentTimeMillis() - start;
          assertEquals(tokens[1], lang);
        }
      }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.