// Package: owlsmx.similaritymeasures
// Source code of owlsmx.similaritymeasures.CosineSimilarity

/*
* Created on 08.12.2004
*
*
* COPYRIGHT NOTICE
*
* Copyright (C) 2005 DFKI GmbH, Germany
* Developed by Benedikt Fries, Matthias Klusch
*
* The code is free for non-commercial use only.
* You can redistribute it and/or modify it under the terms
* of the Mozilla Public License version 1.1  as
* published by the Mozilla Foundation at
* http://www.mozilla.org/MPL/MPL-1.1.txt
*/
package owlsmx.similaritymeasures;

import java.net.URISyntaxException;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import org.mindswap.pellet.TuBox.NotUnfoldableException;

import owlsmx.Indexer.Index;
import owlsmx.Indexer.SimpleIndex;
import owlsmx.data.LocalOntologyContainer;
import owlsmx.exceptions.MatchingException;
import owlsmx.reasoning.PelletReasoner;
import owlsmx.tokenizer.PrimitiveConceptTokenizer;
import owlsmx.utils.CollectionUtils;
import owlsmx.utils.CosineResultTriple;
import owlsmx.utils.MathUtils;
import owlsmx.utils.StringUtils;

/**
* Implementation of the cosine similarity measure
*
* @author Benedikt Fries
*
*/
public class CosineSimilarity extends SimilarityMeasure {
    protected int term_frequency_component=SimilarityMeasure.TERMWEIGHT_LOGARITHMIC;
    PrimitiveConceptTokenizer tokenizer = new PrimitiveConceptTokenizer();
    protected boolean useIndex = true;
   
   
    /**
     *   Constructor
     *   Creates an index
     */
    public CosineSimilarity() {
        super(SimpleIndex.instanceOf());
    }
   
    /**
     * Constructor,
     * Uses given index
     * @param index index that should be used
     */
    public CosineSimilarity(Index index) {
        super(index);
    }
   
    /**
     * Constructor,
     * Uses the index of the given similarity Measure
     *
     * @param measure  measure whose index should be used
     */
    public CosineSimilarity(SimilarityMeasure measure) {
        super(measure);
    }
   
    /**
     * Updates the overall document frequencies with the primitive concepts of a concept
     * @param classname        document to be updated
     * @param primitiveConcepts    the primitive concepts of the concept/document
     */
    protected void updateDocumentFrequency(String classname, Map primitiveConcepts) {
        if (this.usesIndex())
            SimpleIndex.instanceOf().addDocument(classname, primitiveConcepts);
    }
   
    /**
     * Computes the binary term weight for given term frequencies
     *
     * @param termFrequencies  term frequencies to be used
     * @return          computed weighted TF
     */
    protected double[] binaryTermWeight(int[] termFrequencies) {
        double[] result = new double[termFrequencies.length];
        for (int i = 0; i<termFrequencies.length;i++) {
            if (termFrequencies[i]>0)
                result[i]=1.0;
            else
                result[i]=0.0;
        }
        return result;
    }
   
   
    /**
     * Computes the agmented normalized term weight for given term frequencies
     *
     * @param termFrequencies  term frequencies to be used
     * @return          computed weighted TF
     */
    protected double[] augmentedNormalizedTermFrequency(int[] termFrequencies) {
        double[] result = new double[termFrequencies.length];
        int max = 1;
        for (int i = 0; i<termFrequencies.length;i++) {
                if (termFrequencies[i]>max)
                    max=termFrequencies[i];
        }
        for (int i = 0; i<termFrequencies.length;i++) {
            result[i]=(0.5 + (0.5 * termFrequencies[i])/max);
        }
        return result;
    }
   
    /**
     * Computes the logarithmic term weight for given term frequencies
     *
     * @param termFrequencies  term frequencies to be used
     * @return          computed weighted TF
     */
    protected double[] logrithmicTermFrequency(int[] termFrequencies) {
        double[] result = new double[termFrequencies.length];
        for (int i = 0; i<termFrequencies.length;i++) {
            if (termFrequencies[i]<1)
                result[i]=0.0;
            else
                result[i]=( Math.log( (double) termFrequencies[i]) + 1.0 );                   
        }
        return result;
    }
  
    /**
     * Computes the relative term weight for given term frequencies
     *
     * @param termFrequencies  term frequencies to be used
     * @return          computed weighted TF
     */
    protected double[] relativeTermWeight(int[] termFrequencies) {
        double[] result = new double[termFrequencies.length];
        double sum = MathUtils.vectorSum(termFrequencies);
        if (sum!=0) {
            for (int i = 0; i<termFrequencies.length;i++) {
                result[i]=termFrequencies[i]/sum;
            }
        }
        return result;
    }
   
    /**
     * Applies a given IDF to the weighted term frequencies
     *
     * @param result  weighted TFs
     * @param idf    IDF to be used
     * @return      fully weighted TFs
     */
    protected double[] idf(double[] result, double[] idf) {
        for (int i = 0; i<result.length;i++) {
          result[i] = result[i]*idf[i];
      }
      return result;
    }
   
    /**
     * Weights and normalizes terms frequencies without using an IDF
     *
     * @param termFrequencies  term frequencies to be used
     * @return          weighted and normalized TFs
     */
    protected double[] weigthAndNormalizeTerms(int[] termFrequencies) {
      double[] idf = new double[termFrequencies.length];
        for (int i = 0; i<idf.length;i++) {
          idf[i] = 1.0;
      }
      return weigthAndNormalizeTerms(termFrequencies, idf);
    }
   
    /**
     * Weights and normalizes terms frequencies using an IDF
     *
     * @param idf        used inverse document frequency
     * @param termFrequencies  term frequencies to be used
     * @return          weighted and normalized TFs
     */
    protected double[] weigthAndNormalizeTerms(int[] termFrequencies, double[] idf) {
        double[] result = new double[termFrequencies.length];       
        switch(term_frequency_component) {
        case SimilarityMeasure.TERMWEIGHT_BINARY: 
//          Binary weight
            result = binaryTermWeight(termFrequencies);
            break;
        case SimilarityMeasure.TERMWEIGHT_TERMFREQUENCY:
//          Pure term frequency
            for (int i = 0; i<termFrequencies.length;i++) {
                result[i]=termFrequencies[i];
            }
            break;
        case SimilarityMeasure.TERMWEIGHT_AUGMENTED: 
//          Augmented normalized term frequency
            result = idf(augmentedNormalizedTermFrequency(termFrequencies),idf);
            break;
        case SimilarityMeasure.TERMWEIGHT_LOGARITHMIC: 
//          Logrithmic term frequency
            result = idf(logrithmicTermFrequency(termFrequencies),idf);
            break;
        case SimilarityMeasure.TERMWEIGHT_RELATIVE: 
//          relative importance
            result = relativeTermWeight(termFrequencies);
            break;
        }
       
        return result;
    }
   
    /**
     * Computes TF and IDF of terms that are in both documents
     *
     * @param pc1  Map with TFs of document 1
     * @param pc2  Map with TFs of document 2
     * @return    CosineResultTriple which contains TFs of documen 1, TFs of document 2, IDF of the used terms
     * @throws MatchingException  If something goes wrong
     */
    protected CosineResultTriple getTFArrays(Map pc1, Map pc2) throws MatchingException {
        Set size = CollectionUtils.union(pc1.keySet(),pc2.keySet());
        int[] r1 = new int[size.size()];
        int[] r2 = new int[size.size()];
        double[] idf = new double[size.size()];
       
        Iterator iter = size.iterator();
        String current;
        int count = 0;
        //System.out.println("Unweighted term:");
        while (iter.hasNext()) {
            current = (String) iter.next();
           
            if ( (!pc1.containsKey(current)) && (!pc2.containsKey(current)))
              owlsmx.io.ErrorLog.instanceOf().report("Problem with current: " +  current);
           
            if (pc1.containsKey(current))
                r1[count]=((Integer)pc1.get(current)).intValue();
            else
                r1[count]=0;
           
            if (pc2.containsKey(current))
                r2[count]=((Integer)pc2.get(current)).intValue();
            else
                r2[count]=0;
            if ( (r1[count]==0) && (r2[count]==0))
               owlsmx.io.ErrorLog.instanceOf().report("A new Problem with current: " +  current);
            //if (this.usesIndex())
                idf[count] = index.getIDF(current);
            //else
            //    idf[count] = 1;
            //System.out.println("           r1: " + r1[count] + "  r2: " + r2[count] + " idf " + idf[count]);
            count++;
        }
       
      return new CosineResultTriple(r1,r2,idf);
    }
  
    /* (non-Javadoc)
     * @see owlsmx.similaritymeasures.SimilarityMeasureInterface#updateDocument(java.lang.String, java.lang.String)
     */
    public void updateDocument(String document, String tokens) {
        this.updateDocumentFrequency(document,StringUtils.getPrimitiveConcepts(tokens));
    }
   
    /* (non-Javadoc)
     * @see owlsmx.similaritymeasures.SimilarityMeasureInterface#computeSimilarity(java.lang.String, java.lang.String, java.lang.String, java.lang.String)
     */
    public double computeSimilarity(String query, String token1, String service, String token2) throws MatchingException {
        Map pc1 = tokenizer.getTokenFrequencies(token1);
        Map pc2 = tokenizer.getTokenFrequencies(token2);       
       
        CosineResultTriple TFs = getTFArrays(pc1, pc2);
        double[] weightedPC1 =  weigthAndNormalizeTerms(TFs.term1);           
        double[] weightedPC2 =  weigthAndNormalizeTerms(TFs.term2,TFs.idf);
       
        if ( (weightedPC1.length==0) && (weightedPC2.length==0) ) {
            return 1.0;
        }               
        else if ( (weightedPC1.length==0) || (weightedPC2.length==0) ) {
            return 0;
        }       
        return (MathUtils.vectorDotProduct(weightedPC1,weightedPC2)/(MathUtils.vectorNorm(weightedPC1) * MathUtils.vectorNorm(weightedPC2) ) );
    }
       
    /**
     * Change the used weighting
     *
     * @param type  Desired weighting method
     */
    public void setWeigth(int type) {
        this.term_frequency_component=type;
    }

    /**
     * @return currently used weighting method
     */
    public int getWeigth() {
        return term_frequency_component;
    }

    /* (non-Javadoc)
     * @see owlsmx.similaritymeasures.SimilarityMeasureInterface#computeSimilarity(owlsmx.reasoning.PelletReasoner, owlsmx.data.LocalOntologyContainer, java.lang.String, java.lang.String)
     */
    public double computeSimilarity(PelletReasoner reason, LocalOntologyContainer localOntology, String clazz1, String clazz2) {
      try {
      return computeSimilarity(clazz1.toString(), unfoldTerm(reason, localOntology, clazz1), clazz2.toString(), unfoldTerm(reason,localOntology, clazz2));
    } catch (NotUnfoldableException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } catch (MatchingException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } catch (URISyntaxException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    return 0.0;
    }

    /* (non-Javadoc)
     * @see owlsmx.similaritymeasures.SimilarityMeasure#getSimilarityType()
     */
    public short getSimilarityType() {
        return SimilarityMeasure.SIMILARITY_COSINE;
    }





}
// Related classes of owlsmx.similaritymeasures.CosineSimilarity
//
// (Scraper footer preserved as a comment:
//  "Copyright © 2018 www.massapi.com. All rights reserved.
//   All source code are property of their respective owners. Java is a trademark
//   of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.")