Examples of net.yacy.kelondro.data.word.Word

net.yacy.kelondro.data.word.Word

                                urlModified.getTime(),
                                System.currentTimeMillis(),
                                UTF8.getBytes(language),
                                doctype,
                                outlinksSame, outlinksOther);
        Word wprop;
        byte[] wordhash;
        while (i.hasNext()) {
            wentry = i.next();
            word = wentry.getKey();
            wprop = wentry.getValue();

View Full Code Here

        }
        */


        // check if the token appears in the text
        if (words.containsKey(token)) {
          final Word word = words.get(token);
          // token appears in text and matches an existing bookmark tag
          if (tags.containsKey(token)) {
            count = word.occurrences() * tags.get(token).size() * 100;
          }
          // token appears in text and has more than 3 characters
          if (token.length()>3) {
            count = word.occurrences() * 100;
          }
          topwords.add(new YMarkTag(token, count));
        }
      }
      count = 0;

View Full Code Here

  /**
   * Creates a comparator that ranks word keys using the given lookup map.
   *
   * <p>The map reference is stored as-is (no copy is made).
   *
   * @param words map from word string to its {@code Word} entry; consulted by
   *              {@code compare} for both keys being compared
   */
  public YMarkWordCountComparator(final Map<String,Word> words) {
    this.words = words;
  }
  
  public int compare(final String k1, final String k2) {
    final Word w1 = this.words.get(k1);
    final Word w2 = this.words.get(k2);
    
        if(w1.occurrences() > w2.occurrences())
            return 1;
        else if(w1.occurrences() < w2.occurrences())
            return -1;
        else
            return 0; 
  }

View Full Code Here

                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib);
            }
        
            // finally check all words for missing flag entry
            final Iterator<Map.Entry<String, Word>> k = words.entrySet().iterator();
            Word wprop;
            Map.Entry<String, Word> we;
            while (k.hasNext()) {
                we = k.next();
                wprop = we.getValue();
                if (wprop.flags == null) {

View Full Code Here

            final Bitfield flagstemplate,
            final boolean useForLanguageIdentification,
            final WordCache meaningLib) {
        if (text == null) return;
        String word;
        Word wprop;
        WordTokenizer wordenum;
        wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib);
        int pip = 0;
        while (wordenum.hasMoreElements()) {
            word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
            if (useForLanguageIdentification) languageIdentificator.add(word);
            if (word.length() < 2) continue;
            wprop = words.get(word);
            if (wprop == null) wprop = new Word(0, pip, phrase);
            if (wprop.flags == null) wprop.flags = flagstemplate.clone();
            wprop.flags.set(flagpos, true);
            words.put(word, wprop);
            pip++;
            this.RESULT_NUMB_WORDS++;

View Full Code Here

        assert is != null;
        final Set<String> currsentwords = new HashSet<String>();
        String word = "";
        String k;
        int wordlen;
        Word wsp, wsp1;
        int wordHandle;
        int wordHandleCount = 0;
        int sentenceHandleCount = 0;
        int allwordcounter = 0;
        int allsentencecounter = 0;
        int wordInSentenceCounter = 1;
        boolean comb_indexof = false, last_last = false, last_index = false;
        final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
        
        // read source
        final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
        while (wordenum.hasMoreElements()) {
            word = wordenum.nextElement().toLowerCase(Locale.ENGLISH);
            if (languageIdentificator != null) languageIdentificator.add(word);
            if (word.length() < wordminsize) continue;
            
            // distinguish punctuation and words
            wordlen = word.length();
            if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
                // store sentence
                currsentwords.clear();
                wordInSentenceCounter = 1;
            } else {
                // check index.of detection
                if (last_last && comb_indexof && word.equals("modified")) {
                    this.RESULT_FLAGS.set(flag_cat_indexof, true);
                    wordenum.pre(true); // parse lines as they come with CRLF
                }
                if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
                last_last = word.equals("last");
                last_index = word.equals("index");
                
                // store word
                allwordcounter++;
                currsentwords.add(word);
                wsp = words.get(word);
                if (wsp != null) {
                    // word already exists
                    wordHandle = wsp.posInText;
                    wsp.inc();
                } else {
                    // word does not yet exist, create new word entry
                    wordHandle = wordHandleCount++;
                    wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
                    wsp.flags = RESULT_FLAGS.clone();
                    words.put(word, wsp);
                }
                // we now have the unique handle of the word, put it into the sentence:
                wordInSentenceCounter++;

View Full Code Here

        score = 0;
        token = tokens.nextElement();
        
        // check if the token appears in the text
        if (words.containsKey(token.toString())) {          
          final Word word = words.get(token.toString());
          // token appears in text and matches an existing bookmark tag
          if (tags.containsKey(token.toString())) {
            score = word.occurrences() * tags.get(token.toString()).size() * 200;
          }
          // token appears in text and has more than 3 characters
          else if (token.length()>3) {
            score = word.occurrences() * 100;
          }
          // if token is already part of a phrase, reduce score
          if(pwords.toString().indexOf(token.toString())>1) {
            score = score / 3;
          }

View Full Code Here

                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
            }


            // finally check all words for missing flag entry
            final Iterator<Map.Entry<String, Word>> k = this.words.entrySet().iterator();
            Word wprop;
            Map.Entry<String, Word> we;
            while (k.hasNext()) {
                we = k.next();
                wprop = we.getValue();
                if (wprop.flags == null) {

View Full Code Here

            final Bitfield flagstemplate,
            final boolean useForLanguageIdentification,
            final WordCache meaningLib) {
        if (text == null) return;
        String word;
        Word wprop;
        WordTokenizer wordenum;
        wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib);
        int pip = 0;
        while (wordenum.hasMoreElements()) {
            word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
            if (useForLanguageIdentification) this.languageIdentificator.add(word);
            if (word.length() < 2) continue;
            wprop = this.words.get(word);
            if (wprop == null) wprop = new Word(0, pip, phrase);
            if (wprop.flags == null) wprop.flags = flagstemplate.clone();
            wprop.flags.set(flagpos, true);
            this.words.put(word, wprop);
            pip++;
            this.RESULT_NUMB_WORDS++;

View Full Code Here

        assert is != null;
        final Set<String> currsentwords = new HashSet<String>();
        String word = "";
        String k;
        int wordlen;
        Word wsp;
        final Word wsp1;
        int wordHandle;
        int wordHandleCount = 0;
        final int sentenceHandleCount = 0;
        int allwordcounter = 0;
        final int allsentencecounter = 0;
        int wordInSentenceCounter = 1;
        boolean comb_indexof = false, last_last = false, last_index = false;
        final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);


        // read source
        final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
        while (wordenum.hasMoreElements()) {
            word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
            if (this.languageIdentificator != null) this.languageIdentificator.add(word);
            if (word.length() < wordminsize) continue;


            // distinguish punctuation and words
            wordlen = word.length();
            if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
                // store sentence
                currsentwords.clear();
                wordInSentenceCounter = 1;
            } else {
                // check index.of detection
                if (last_last && comb_indexof && word.equals("modified")) {
                    this.RESULT_FLAGS.set(flag_cat_indexof, true);
                    wordenum.pre(true); // parse lines as they come with CRLF
                }
                if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
                last_last = word.equals("last");
                last_index = word.equals("index");


                // store word
                allwordcounter++;
                currsentwords.add(word);
                wsp = this.words.get(word);
                if (wsp != null) {
                    // word already exists
                    wordHandle = wsp.posInText;
                    wsp.inc();
                } else {
                    // word does not yet exist, create new word entry
                    wordHandle = wordHandleCount++;
                    wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
                    wsp.flags = this.RESULT_FLAGS.clone();
                    this.words.put(word, wsp);
                }
                // we now have the unique handle of the word, put it into the sentence:
                wordInSentenceCounter++;

View Full Code Here

0 1

TOP

Related Classes of net.yacy.kelondro.data.word.Word

de.anomic.data.ymark.YMarkAutoTagger

de.anomic.data.ymark.YMarkWordCountComparator

de.anomic.search.Segment

net.yacy.document.Condenser

net.yacy.kelondro.index.HandleSet

net.yacy.search.index.Segment

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.