Package de.anomic.data.ymark

Source Code of de.anomic.data.ymark.YMarkAutoTagger

package de.anomic.data.ymark;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser.Failure;
import net.yacy.document.WordTokenizer;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;

public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandler {

  public final static String SPACE = " ";
  public final static String POISON = "";
  public final static HashSet<String> stopwords = new HashSet<String>(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo",
      "and", "with", "the", "gt", "lt"));


  private final ArrayBlockingQueue<String> bmkQueue;
  private final YMarkTables ymarks;
  private final String bmk_user;
  private final LoaderDispatcher loader;

  private final boolean merge;

  public YMarkAutoTagger(final ArrayBlockingQueue<String> bmkQueue, final LoaderDispatcher loader, final YMarkTables ymarks, final String bmk_user, final boolean merge) {
    this.bmkQueue = bmkQueue;
    this.ymarks = ymarks;
    this.bmk_user = bmk_user;
    this.loader = loader;
    this.merge = merge;
  }

  public YMarkAutoTagger(final LoaderDispatcher loader, final YMarkTables ymarks, final String bmk_user) {
    this.bmkQueue = new ArrayBlockingQueue<String>(1);
    this.ymarks = ymarks;
    this.bmk_user = bmk_user;
    this.loader = loader;
    this.merge = true;
  }

  private static Document loadDocument(final String url, final LoaderDispatcher loader) {
    DigestURI uri;
    Response response;
    try {
      uri = new DigestURI(url);
    } catch (final MalformedURLException e) {
      Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to malformed url: "+url);
      return null;
    }
    try {
      response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true);
    } catch (final IOException e) {
      Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to IOException for url: "+url);
      return null;
    }
    try {
      return Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
    } catch (final Failure e) {
      Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to a parser failure for url: "+url);
      return null;
    }
  }
 
  public static String autoTag(final Document document, final int max, final TreeMap<String, YMarkTag> tags) {
    final TreeSet<YMarkTag> topwords = new TreeSet<YMarkTag>();
    StringBuilder token;

    if(document != null) {

      //get words from document
      final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib).words();
     
      // generate potential tags from document title, description and subject
      final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
      final StringBuilder buffer = new StringBuilder(bufferSize);
      final StringBuilder pwords = new StringBuilder(1000);
      buffer.append(document.dc_title().toLowerCase());
      buffer.append(document.dc_description().toLowerCase());
      buffer.append(document.dc_subject(' ').toLowerCase());
      final Enumeration<StringBuilder> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
      int score = 0;
     
      // get phrases
      final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2);
      phrases.putAll(getPhrases(document, 3));
      final Iterator<String> iter = phrases.keySet().iterator();
      while(iter.hasNext()) {
        score = 10;
        final String phrase = iter.next();             
        if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
          score = phrases.get(phrase).size() * phrase.split(" ").length * 20;
        }
        if(isDigitSpace(phrase)) {
          score = 10;
        }
        if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {         
          score = score * 10;
        }
        if (tags.containsKey(phrase)) {
          score = score * 20;
        }
        topwords.add(new YMarkTag(phrase, score));
        pwords.append(phrase);
        pwords.append(' ');
      }
     
      // loop through potential tag and rank them
      while(tokens.hasMoreElements()) {       
        score = 0;
        token = tokens.nextElement();
       
        // check if the token appears in the text
        if (words.containsKey(token.toString())) {         
          final Word word = words.get(token.toString());
          // token appears in text and matches an existing bookmark tag
          if (tags.containsKey(token.toString())) {
            score = word.occurrences() * tags.get(token.toString()).size() * 200;
          }
          // token appears in text and has more than 3 characters
          else if (token.length()>3) {
            score = word.occurrences() * 100;
          }
          // if token is already part of a phrase, reduce score
          if(pwords.toString().indexOf(token.toString())>1) {
            score = score / 3;
          }
          topwords.add(new YMarkTag(token.toString(), score));
        }
      }
      score = 0;
      buffer.setLength(0);
      for(final YMarkTag tag : topwords) {
        if(score < max) {
          if(tag.size() > 100) {
            buffer.append(tag.name());
            buffer.append(YMarkUtil.TAGS_SEPARATOR);
            score++;
          }
        } else {
          break;
        }
      }
      final String clean =  YMarkUtil.cleanTagsString(buffer.toString());
      if(clean.equals(YMarkEntry.BOOKMARK.TAGS.deflt())) {
        return document.getFileExtension();
      }
      return clean;
    }
    return new String();   
 
 
  private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
    final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
    final StringBuilder phrase = new StringBuilder(128);
    final Enumeration<StringBuilder> tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
    StringBuilder token;
    int count = 0;
   
    // loop through text
    while(tokens.hasMoreElements()) {       

      token = tokens.nextElement();     
      if(stopwords.contains(token.toString()) || isDigitSpace(token.toString()))
        continue;     
     
      // if we have a full phrase, delete the first token
      count++;
      if(count > size)
        phrase.delete(0, phrase.indexOf(SPACE)+1);
     
      // append new token
      if(phrase.length() > 1)
        phrase.append(SPACE);           
      phrase.append(token);

      if(count >= size) {  // make sure we really have a phrase
        if(phrases.containsKey(phrase.toString())) {
          phrases.get(phrase.toString()).inc();
        } else {
          phrases.put(phrase.toString(), new YMarkTag(phrase.toString()));
        }
      }   
    }
   
    return phrases;
  }

  public static String autoTag(final String url, final LoaderDispatcher loader, final int max, final TreeMap<String, YMarkTag> tags) {
    final Document document = loadDocument(url, loader);
    if (document != null)
      return autoTag(document, max, tags);
    else
      return "/IOExceptions";
  }
 
  public static boolean isDigitSpace(String str) {
    if (str == null) {
      return false;
      }
      int sz = str.length();
      for (int i = 0; i < sz; i++) {
        if ((Character.isDigit(str.charAt(i)) == false) && (str.charAt(i) != ' ')) {
          return false;
        }
      }
      return true;
  }

  public void run() {
    Log.logInfo(YMarkTables.BOOKMARKS_LOG, "autoTagger run()");
    Thread.currentThread().setUncaughtExceptionHandler(this);
    String url = null;
    String tagString;
    Iterator<String> tit;
    try {
      final TreeMap<String, YMarkTag> tags = this.ymarks.getTags(this.bmk_user);
      Log.logInfo(YMarkTables.BOOKMARKS_LOG, "autoTagger queue size: "+this.bmkQueue.size());
      while((url = this.bmkQueue.take()) != POISON) {
        tagString = autoTag(url, this.loader, 5, tags);
        if (tagString.equals("/IOExceptions")) {
          this.ymarks.addFolder(bmk_user, url, tagString);
          tagString = "";
        }       
        // update tags
        this.ymarks.addTags(this.bmk_user, url, tagString, this.merge);

        // update tags
        tit = YMarkUtil.keysStringToSet(tagString).iterator();
        while(tit.hasNext()) {
            final String tag = tit.next();
          if(tags.containsKey(tag)) {
              tags.get(tag).inc();
            } else {
              tags.put(tag, new YMarkTag(tag));
            }
        }
      }
      Log.logInfo(YMarkTables.BOOKMARKS_LOG, "autoTagger has been poisoned");
    } catch (final InterruptedException e) {
      Log.logException(e);
    } catch (final IOException e) {
      Log.logWarning(YMarkTables.BOOKMARKS_LOG.toString(), "autoTagger - IOException for URL: "+url);
    } catch (final RowSpaceExceededException e) {
      Log.logException(e);
    } finally {
    }
  }

  public void uncaughtException(final Thread t, final Throwable e) {
    Log.logWarning(YMarkTables.BOOKMARKS_LOG, "I caught an uncaughtException in thread "+t.getName());
    Log.logException(e);
  }
}
TOP

Related Classes of de.anomic.data.ymark.YMarkAutoTagger

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.