Package net.yacy.document

Examples of net.yacy.document.WordTokenizer


                    // Search word highlighting
                    for (final StringBuilder s: sentences) {
                        sentence = s.toString();
                        Enumeration<String> tokens = null;
                        tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib);
                        while (tokens.hasMoreElements()) {
                            token = tokens.nextElement();
                            if (token.length() > 0) {
                                prop.put("viewMode_words_" + i + "_nr", i + 1);
                                prop.put("viewMode_words_" + i + "_word", token);
View Full Code Here


      final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
      final StringBuilder buffer = new StringBuilder(bufferSize);
      buffer.append(document.dc_title());
      buffer.append(document.dc_description());
      buffer.append(document.dc_subject(' '));
      final Enumeration<String> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);

      int count = 0;

      // loop through potential tags and rank them
      while(tokens.hasMoreElements()) {
        count = 0;
        token = tokens.nextElement();

        /*
        pair.delete(0, pair.indexOf(SPACE)+1);
        if(pair.length() > 1)
          pair.append(SPACE);
View Full Code Here

                    // Search word highlighting
                    for (final StringBuilder s: sentences) {
                        sentence = s.toString();
                        Enumeration<StringBuilder> tokens = null;
                        tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib);
                        while (tokens.hasMoreElements()) {
                            token = tokens.nextElement();
                            if (token.length() > 0) {
                                prop.put("viewMode_words_" + i + "_nr", i + 1);
                                prop.put("viewMode_words_" + i + "_word", token);
View Full Code Here

      final StringBuilder buffer = new StringBuilder(bufferSize);
      final StringBuilder pwords = new StringBuilder(1000);
      buffer.append(document.dc_title().toLowerCase());
      buffer.append(document.dc_description().toLowerCase());
      buffer.append(document.dc_subject(' ').toLowerCase());
      final Enumeration<StringBuilder> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
      int score = 0;
     
      // get phrases
      final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2);
      phrases.putAll(getPhrases(document, 3));
      final Iterator<String> iter = phrases.keySet().iterator();
      while(iter.hasNext()) {
        score = 10;
        final String phrase = iter.next();             
        if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
          score = phrases.get(phrase).size() * phrase.split(" ").length * 20;
        }
        if(isDigitSpace(phrase)) {
          score = 10;
        }
        if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {         
          score = score * 10;
        }
        if (tags.containsKey(phrase)) {
          score = score * 20;
        }
        topwords.add(new YMarkTag(phrase, score));
        pwords.append(phrase);
        pwords.append(' ');
      }
     
      // loop through potential tags and rank them
      while(tokens.hasMoreElements()) {       
        score = 0;
        token = tokens.nextElement();
       
        // check if the token appears in the text
        if (words.containsKey(token.toString())) {         
          final Word word = words.get(token.toString());
          // token appears in text and matches an existing bookmark tag
View Full Code Here

 
 
  private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
    final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
    final StringBuilder phrase = new StringBuilder(128);
    final Enumeration<StringBuilder> tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
    StringBuilder token;
    int count = 0;
   
    // loop through text
    while(tokens.hasMoreElements()) {       

      token = tokens.nextElement();     
      if(stopwords.contains(token.toString()) || isDigitSpace(token.toString()))
        continue;     
     
      // if we have a full phrase, delete the first token
      count++;
View Full Code Here

TOP

Related Classes of net.yacy.document.WordTokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.