Package org.fnlp.nlp.duplicate

Source Code of org.fnlp.nlp.duplicate.Similarity

/**
*  This file is part of FNLP (formerly FudanNLP).
*  FNLP is free software: you can redistribute it and/or modify
*  it under the terms of the GNU Lesser General Public License as published by
*  the Free Software Foundation, either version 3 of the License, or
*  (at your option) any later version.
*  FNLP is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*  You should have received a copy of the GNU General Public License
*  along with FudanNLP.  If not, see <http://www.gnu.org/licenses/>.
*  Copyright 2009-2014 www.fnlp.org. All rights reserved.
*/

package org.fnlp.nlp.duplicate;

import gnu.trove.iterator.TIntIntIterator;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntIntHashMap;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.fnlp.nlp.duplicate.FingerPrint.Type;


/**
* 计算两两相似度
*/
public class Similarity {
  /**
   * 相似度矩阵<a,<b,sim>>
   */
  public TIntIntHashMap[] similarityMap;
  public TreeSet<DocSim> dsMap;
  public double thres;
  private static final double lenDiffThresh = 0.3;
  public TreeMap<String, TIntArrayList> locationMap;
  public Type type;
  public int[] featureLen;
  public ArrayList<String> docs;
  public int capacity=100;
  private int numThreads;


  public Similarity(int numThreads, Type type) {
    this.type = type;
    this.numThreads = numThreads;
    thres = 0.7;

  }


  public void feature(){
    locationMap = new TreeMap<String, TIntArrayList>();
    Set<String> set;
    featureLen = new int[docs.size()];
    for(int i=1;i<docs.size();i++){
      set = FingerPrint.featureset(docs.get(i),type);

      featureLen[i]=set.size();
      Object[] sa =  set.toArray();     
      for(int j = 0; j < sa.length; j++) {
        TIntArrayList al = locationMap.get((String)sa[j]);
        if(al == null) {
          al = new TIntArrayList();
          al.add(i);
          locationMap.put((String)sa[j], al);
        }
        else
          al.add(i);
      }
    }
  }


  public  void duplicate(ArrayList<String> docs) throws Exception {
    this.docs = docs;
    dsMap = new TreeSet<DocSim>();
    feature();

    similarity();
    System.out.println("去重复");

    boolean[] dup = new boolean[docs.size()];
    for(int id1=0;id1<docs.size();id1++){
      if(dup[id1]||similarityMap[id1]==null)
        continue;


      ArrayList<Integer> ids = new ArrayList<Integer>();
      ids.add(id1);
      TIntIntIterator it = similarityMap[id1].iterator();
      for ( int i = similarityMap[id1].size(); i-- > 0; ) {
        it.advance();
        int id2 = it.key();       
        double sim = ((double)(it.value()* 2)) / (featureLen[id1] + featureLen[id2]);
        if(sim > thres ){
          dup[id2]= true;
          ids.add(id2);
        }
      }
      DocSim docSim = new DocSim(ids);
      dsMap.add(docSim);
    }
  }

  public  void printDocSim() {
    Iterator<DocSim> iter = dsMap.iterator();
    while (iter.hasNext()) {
      System.out.println(iter.next().toString());
    }
  }

  public  void similarity() throws InterruptedException
    System.out.println("相似度");
    ThreadPoolExecutor pool = new ThreadPoolExecutor(numThreads, numThreads, 1000,
        TimeUnit.SECONDS, new ArrayBlockingQueue<Runnable>(capacity ));
    similarityMap = new TIntIntHashMap[docs.size()];
   
    Iterator<Entry<String, TIntArrayList>> iterator =
        locationMap.entrySet().iterator();


    while(iterator.hasNext()) {
      if(pool.getQueue().remainingCapacity()==0){
        Thread.sleep(10);
        continue;
      }
      Entry<String, TIntArrayList> entry = iterator.next();

      TIntArrayList al = entry.getValue();
      CalcSimilarity cs = new CalcSimilarity(al);
      pool.execute(cs);
    }
    while(pool.getActiveCount()>0){
      Thread.sleep(10);
    }
    pool.shutdown();
  }

  public static void main(String[] args) throws IOException, ClassNotFoundException {


  }
  public class CalcSimilarity implements Runnable {

   
    private TIntArrayList al;

    public CalcSimilarity() {

    }

    public CalcSimilarity(TIntArrayList al2) {
      this.al =al2;
    }

    @Override
    public void run() {
      for(int i = 0; i < al.size(); i++) {
        int a = al.get(i);

        for(int j = i + 1; j < al.size(); j++) {
          int b = al.get(j);
          double lenDiff;
          lenDiff = Math.abs(featureLen[a]-featureLen[b]);
          lenDiff /=Math.max(featureLen[a],featureLen[b]);
          if(lenDiff>lenDiffThresh)
            continue;
         
          int ids,idl;
          if(a<=b){
            ids = a;
            idl=b;
          }else{
            ids = b;
            idl=a;
          }         
          if(similarityMap[ids] == null)
            similarityMap[ids] = new TIntIntHashMap();
          synchronized (similarityMap[ids]) {           
            similarityMap[ids].adjustOrPutValue(idl, 1, 1);
          }

        }

      }
    }
  }
}
TOP

Related Classes of org.fnlp.nlp.duplicate.Similarity

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.