Examples of net.yacy.document.WordTokenizer

net.yacy.document.WordTokenizer


                    // Search word highlighting
                    for (final StringBuilder s: sentences) {
                        sentence = s.toString();
                        Enumeration<String> tokens = null;
                        tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib);
                        while (tokens.hasMoreElements()) {
                            token = tokens.nextElement();
                            if (token.length() > 0) {
                                prop.put("viewMode_words_" + i + "_nr", i + 1);
                                prop.put("viewMode_words_" + i + "_word", token);

View Full Code Here

      final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
      final StringBuilder buffer = new StringBuilder(bufferSize);
      buffer.append(document.dc_title());
      buffer.append(document.dc_description());
      buffer.append(document.dc_subject(' '));
      final Enumeration<String> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);


      int count = 0;


      // loop through potential tags and rank them
      while(tokens.hasMoreElements()) {
        count = 0;
        token = tokens.nextElement();


        /*
        pair.delete(0, pair.indexOf(SPACE)+1);
        if(pair.length() > 1)
          pair.append(SPACE);

View Full Code Here


                    // Search word highlighting
                    for (final StringBuilder s: sentences) {
                        sentence = s.toString();
                        Enumeration<StringBuilder> tokens = null;
                        tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib);
                        while (tokens.hasMoreElements()) {
                            token = tokens.nextElement();
                            if (token.length() > 0) {
                                prop.put("viewMode_words_" + i + "_nr", i + 1);
                                prop.put("viewMode_words_" + i + "_word", token);

View Full Code Here

      final StringBuilder buffer = new StringBuilder(bufferSize);
      final StringBuilder pwords = new StringBuilder(1000);
      buffer.append(document.dc_title().toLowerCase());
      buffer.append(document.dc_description().toLowerCase());
      buffer.append(document.dc_subject(' ').toLowerCase());
      final Enumeration<StringBuilder> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
      int score = 0;
      
      // get phrases
      final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2);
      phrases.putAll(getPhrases(document, 3));
      final Iterator<String> iter = phrases.keySet().iterator();
      while(iter.hasNext()) {
        score = 10;
        final String phrase = iter.next();              
        if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
          score = phrases.get(phrase).size() * phrase.split(" ").length * 20;
        }
        if(isDigitSpace(phrase)) {
          score = 10;
        }
        if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {          
          score = score * 10;
        }
        if (tags.containsKey(phrase)) {
          score = score * 20;
        }
        topwords.add(new YMarkTag(phrase, score));
        pwords.append(phrase);
        pwords.append(' ');
      }
      
      // loop through potential tags and rank them
      while(tokens.hasMoreElements()) {        
        score = 0;
        token = tokens.nextElement();
        
        // check if the token appears in the text
        if (words.containsKey(token.toString())) {          
          final Word word = words.get(token.toString());
          // token appears in text and matches an existing bookmark tag

View Full Code Here

  }  
  
  private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
    final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
    final StringBuilder phrase = new StringBuilder(128);
    final Enumeration<StringBuilder> tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
    StringBuilder token;
    int count = 0;
    
    // loop through text
    while(tokens.hasMoreElements()) {        


      token = tokens.nextElement();      
      if(stopwords.contains(token.toString()) || isDigitSpace(token.toString()))
        continue;      
      
      // if we have a full phrase, delete the first token
      count++;

View Full Code Here

TOP

Related Classes of net.yacy.document.WordTokenizer

de.anomic.data.ymark.YMarkAutoTagger

ViewFile

java.io.ByteArrayInputStream

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.