Package org.apache.tika.language

Examples of org.apache.tika.language.LanguageIdentifier


       
        result = getFirstLanguage(result);
       
        if (result == null) {
            // Language is still unspecified, so use ProfileHandler's result
            LanguageIdentifier langIdentifier = profilingHandler.getLanguage();
            // FUTURE KKr - provide config for specifying required certainty level.
            if (langIdentifier.isReasonablyCertain()) {
                result = langIdentifier.getLanguage();
                LOGGER.trace("Using language specified by profiling handler: " + result);
            } else {
                result = "";
            }
View Full Code Here


    String content = textHandler.toString();

    if (languageDetection) {
      String languageCode = bean.getString("languagecode");
      if (languageCode == null || languageCode.equals("")) {
        LanguageIdentifier identifier = new LanguageIdentifier(content);
        String lang = identifier.getLanguage();
        if (identifier.isReasonablyCertain() && (allowedLanguages == null || allowedLanguages.contains(lang))) {
          bean.set("languagecode", lang);
        }
      }
    }
View Full Code Here

  @Test
  public void testLanguageIndentifier() {
    try {
      long total = 0;
      LanguageIdentifier identifier;
      BufferedReader in = new BufferedReader(new InputStreamReader(this
          .getClass().getResourceAsStream("test-referencial.txt")));
      String line = null;
      while ((line = in.readLine()) != null) {
        String[] tokens = line.split(";");
        if (!tokens[0].equals("")) {
          StringBuilder content = new StringBuilder();
          // Test each line of the file...
          BufferedReader testFile = new BufferedReader(new InputStreamReader(
              this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
          String testLine = null, lang = null;
          while ((testLine = testFile.readLine()) != null) {
            content.append(testLine + "\n");
            testLine = testLine.trim();
            if (testLine.length() > 256) {
              identifier = new LanguageIdentifier(testLine);
              lang = identifier.getLanguage();
              Assert.assertEquals(tokens[1], lang);
            }
          }
          testFile.close();

          // Test the whole file
          long start = System.currentTimeMillis();
          System.out.println(content.toString());
          identifier = new LanguageIdentifier(content.toString());
          lang = identifier.getLanguage();
          System.out.println(lang);
          total += System.currentTimeMillis() - start;
          Assert.assertEquals(tokens[1], lang);
        }
      }
View Full Code Here

    final String text = StringTools.readFile(new FileInputStream(filename), encoding);
    return detectLanguageOfString(text);
  }

  private static Language detectLanguageOfString(final String text) {
    final LanguageIdentifier identifier = new LanguageIdentifier(text);
    final Language lang = Language.getLanguageForShortName(identifier.getLanguage());
    return lang;
  }
View Full Code Here

    if (text.length() < MIN_LENGTH_FOR_AUTO_DETECTION && fallbackLanguage != null) {
      print("Auto-detected language of text with length " + text.length() + " is not reasonably certain, using '" + fallbackLanguage + "' as fallback");
      return Language.getLanguageForShortName(fallbackLanguage);
    }
   
    final LanguageIdentifier identifier = new LanguageIdentifier(text);
    Language lang;
    try {
      lang = Language.getLanguageForShortName(identifier.getLanguage());
    } catch (IllegalArgumentException e) {
      // fall back to English
      lang = Language.getLanguageForLocale(Locale.ENGLISH);
    }
    if (lang.getDefaultVariant() != null) {
View Full Code Here

      httpServer = null;
    }
  }

  private Language autoDetectLanguage(String text) {
    final LanguageIdentifier langIdentifier = new LanguageIdentifier(text);
    Language lang;
    try {
      lang = Language.getLanguageForShortName(langIdentifier.getLanguage());
    } catch (IllegalArgumentException e) {
      lang = Language.getLanguageForLocale(Locale.getDefault());
    }
    if (lang.hasVariant()) {
      // UI only shows variants like "English (American)", not just "English", so use that:
View Full Code Here

  @Test
  public void testLanguageIndentifier() {
    try {
      long total = 0;
      LanguageIdentifier identifier;
      BufferedReader in = new BufferedReader(new InputStreamReader(this
          .getClass().getResourceAsStream("test-referencial.txt")));
      String line = null;
      while ((line = in.readLine()) != null) {
        String[] tokens = line.split(";");
        if (!tokens[0].equals("")) {
          StringBuilder content = new StringBuilder();
          // Test each line of the file...
          BufferedReader testFile = new BufferedReader(new InputStreamReader(
              this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
          String testLine = null, lang = null;
          while ((testLine = testFile.readLine()) != null) {
            content.append(testLine + "\n");
            testLine = testLine.trim();
            if (testLine.length() > 256) {
              identifier = new LanguageIdentifier(testLine);
              lang = identifier.getLanguage();
              assertEquals(tokens[1], lang);
            }
          }
          testFile.close();

          // Test the whole file
          long start = System.currentTimeMillis();
          System.out.println(content.toString());
          identifier = new LanguageIdentifier(content.toString());
          lang = identifier.getLanguage();
          System.out.println(lang);
          total += System.currentTimeMillis() - start;
          assertEquals(tokens[1], lang);
        }
      }
View Full Code Here

      String content = parse.getText();
      if (content != null) {
       text.append(" ").append(content.toString());
      }

      LanguageIdentifier identifier = new LanguageIdentifier(text.toString());

      if (onlyCertain) {
        if (identifier.isReasonablyCertain()) {
          return identifier.getLanguage();
        }
      } else {
        return identifier.getLanguage();
      }
    }
    return null;
  }
View Full Code Here

  }

  private void extractLanguage(JCas plainTextView) {
    try {
      LanguageIdentifier li = new LanguageIdentifier(new LanguageProfile(plainTextView.getDocumentText()));
      if (li.getLanguage() != null && !"".equals(li.getLanguage()))
        plainTextView.setDocumentLanguage(li.getLanguage());
    }
    catch (Exception e) {
      this.getContext().getLogger().log(Level.WARNING, new StringBuffer("Could not extract language due to ")
              .append(e.getLocalizedMessage()).toString());
    }
View Full Code Here

        if (langDetect) {
            try {
                if (language != null) {
                    metadata.add(Metadata.CONTENT_LANGUAGE, language);
                } else {
                    LanguageIdentifier identifier = new LanguageIdentifier(parsedContent);
                    language = identifier.getLanguage();
                }
                context = context.createExternalValueContext(language);
                languageMapper.parse(context);
            } catch(Throwable t) {
                logger.debug("Cannot detect language: [{}]", t.getMessage());
View Full Code Here

TOP

Related Classes of org.apache.tika.language.LanguageIdentifier

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.