Package ivory.ffg.data

Source Code of ivory.ffg.data.DocumentVectorUtility

package ivory.ffg.data;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.IOException;
import java.util.List;

import com.google.common.collect.Lists;

import com.kamikaze.pfordelta.PForDelta;

import ivory.core.compression.BitInputStream;
import ivory.core.compression.BitOutputStream;
import ivory.core.data.document.IntDocVector;
import ivory.core.data.index.TermPositions;

/**
* Auxiliary functions
*
* @author Nima Asadi
*/
public class DocumentVectorUtility {
  public static final int BLOCK_SIZE = 128;
  public static final int MAX_POSITIONS = 100;
  private static final int[] TEMP_POSITIONS = new int[MAX_POSITIONS];

  /**
   * Given a document vector and an array of query terms, this function
   * constructs the positions.
   *
   * @param doc Document vector
   * @param terms Query terms
   * @return Position array for every query term
   */
  public static int[][] getPositions(int[] doc, int[] terms) {
    int[][] positions = new int[terms.length][];
    int pindex = 0;

    for(int i = 0; i < terms.length; i++) {
      pindex = 0;
      for(int j = 0; j < doc.length && pindex < TEMP_POSITIONS.length; j++) {
        if(doc[j] == terms[i]) {
          TEMP_POSITIONS[pindex++] = j + 1;
        }
      }
      positions[i] = new int[pindex];
      for(int j = 0; j < positions[i].length; j++) {
        positions[i][j] = TEMP_POSITIONS[j];
      }
    }

    return positions;
  }

  /**
   * Serializes the positions using gamma codes
   *
   * @param positions Array of positions for a term
   * @return Serialized positions (using gamma codes)
   */
  public static byte[] serializePositions(int[] positions) throws IOException {
    ByteArrayOutputStream b = new ByteArrayOutputStream();
    BitOutputStream t = new BitOutputStream(b);

    t.writeGamma(positions.length);

    for (int i = 0; i < positions.length; i++) {
      if (i == 0) {
        t.writeGamma(positions[0] + 1);
      } else {
        int pgap = positions[i] - positions[i - 1];
        t.writeGamma(pgap);
      }
    }

    t.padAndFlush();
    t.close();

    return b.toByteArray();
  }

  /**
   * Deserializes the gamma-encoded positions.
   *
   * @param bytes Serialized positions
   * @return A decoded integer array of positions
   */
  public static int[] deserializePositions(byte[] bytes) throws IOException {
    ByteArrayInputStream byteStream = new ByteArrayInputStream(bytes);
    BitInputStream bitStream = new BitInputStream(byteStream);

    int[] positions = new int[bitStream.readGamma()];
    for(int i = 0; i < positions.length; i++) {
      if (i == 0) {
        positions[i] = bitStream.readGamma() - 1;
      } else {
        positions[i] = (positions[i - 1] + bitStream.readGamma());
      }
    }

    bitStream.close();

    return positions;
  }

  /**
   * Compresses positions using PForDelta compression
   *
   * @param positions Array of positions for a term
   * @return Serialized positions (using PForDelta)
   */
  public static int[][] compressData(int[] data, int blockSize, boolean computeGaps) {
    // Data is stored in blocks of equal size..
    int nbBlocks = (int) Math.ceil(((double) data.length) / ((double) blockSize));
    int[][] compressedBlocks = new int[nbBlocks][];

    int[] temp = new int[blockSize];

    // Compress all blocks except for the last block which might
    // contain fewer elements.
    for(int i = 0; i < nbBlocks - 1; i++) {
      if(!computeGaps) {
        for(int j = 0; j < temp.length; j++) {
          temp[j] = data[i * blockSize + j];
        }
      } else {
        temp[0] = data[i * blockSize];
        int pre = temp[0];
        for(int j = 1; j < temp.length; j++) {
          temp[j] = data[i * blockSize + j] - pre;
          pre = data[i * blockSize + j];
        }
      }
      compressedBlocks[i] = PForDelta.compressOneBlockOpt(temp, blockSize);
    }

    // Compress the last block
    int remaining = lastBlockSize(data.length, nbBlocks, blockSize);
    temp = new int[remaining];
    if(!computeGaps) {
      for(int j = 0; j < temp.length; j++) {
        temp[j] = data[(nbBlocks - 1) * blockSize + j];
      }
    } else {
      temp[0] = data[(nbBlocks - 1) * blockSize];
      int pre = temp[0];
      for(int j = 1; j < temp.length; j++) {
        temp[j] = data[(nbBlocks - 1) * blockSize + j] - pre;
        pre = data[(nbBlocks - 1) * blockSize + j];
      }
    }
    compressedBlocks[nbBlocks - 1] = PForDelta.compressOneBlockOpt(temp, remaining);

    return compressedBlocks;
  }

  public static int lastBlockSize(int dataLength, int nbBlocks, int blockSize) {
    return dataLength - ((nbBlocks - 1) * blockSize);
  }

  /**
   * Factory method
   *
   * @param documentVectorClass DocumentVector class
   * @param document IntDocVector (term positions start from 1)
   * @return New DocumentVector (term positions start from 1)
   */
  public static DocumentVector newInstance(String documentVectorClass, IntDocVector document)
    throws Exception {
    IntDocVector.Reader r = document.getReader();

    if(documentVectorClass.equals(DocumentVectorMiniInvertedIndex.class.getName())) {
      List<Integer> termids = Lists.newArrayList();
      List<TermPositions> positions = Lists.newArrayList();

      int cnt = 0;
      while(r.hasMoreTerms()) {
        termids.add(r.nextTerm());
        int[] p = r.getPositions();
        positions.add(new TermPositions(p, r.getTf()));

        for(int j = 0; j < p.length; j++) {
          if(p[j] > cnt) {
            cnt = p[j];
          }
        }
      }

      int[] data = new int[termids.size()];
      for(int i = 0; i < termids.size(); i++) {
        data[i] = termids.get(i);
      }

      return DocumentVectorMiniInvertedIndex.newInstance(data, positions, cnt);
    }

    int cnt = 0;
    while(r.hasMoreTerms()) {
      r.nextTerm();
      int[] p = r.getPositions();
      for(int j = 0; j < p.length; j++) {
        if(p[j] > cnt) {
          cnt = p[j];
        }
      }
    }

    r = document.getReader();
    int[] data = new int[cnt];
    while(r.hasMoreTerms()) {
      int id = r.nextTerm();
      int[] p = r.getPositions();
      for(int j = 0; j < p.length; j++) {
        data[p[j] - 1] = id;
      }
    }

    if(documentVectorClass.equals(DocumentVectorHashedArray.class.getName())) {
      return DocumentVectorHashedArray.newInstance(data);
    } else if(documentVectorClass.equals(DocumentVectorPForDeltaArray.class.getName())) {
      return DocumentVectorPForDeltaArray.newInstance(data);
    } else if(documentVectorClass.equals(DocumentVectorVIntArray.class.getName())) {
      return DocumentVectorVIntArray.newInstance(data);
    } else {
      throw new ClassNotFoundException("DocumentVector " + documentVectorClass + " class not found!");
    }
  }

  /**
   * Reads an instance of DocumentVector from input
   *
   * @param input DataInput
   * @param documentVectorClass DocumentVector class
   * @return DocumentVector object
   */
  public static DocumentVector readInstance(String documentVectorClass, DataInput input) throws Exception {
    if(documentVectorClass.equals(DocumentVectorMiniInvertedIndex.class.getName())) {
      return DocumentVectorMiniInvertedIndex.readInstance(input);
    } else if(documentVectorClass.equals(DocumentVectorHashedArray.class.getName())) {
      return DocumentVectorHashedArray.readInstance(input);
    } else if(documentVectorClass.equals(DocumentVectorPForDeltaArray.class.getName())) {
      return DocumentVectorPForDeltaArray.readInstance(input);
    } else if(documentVectorClass.equals(DocumentVectorVIntArray.class.getName())) {
      return DocumentVectorVIntArray.readInstance(input);
    } else {
      throw new ClassNotFoundException("DocumentVector " + documentVectorClass + " class not found!");
    }
  }
}
TOP

Related Classes of ivory.ffg.data.DocumentVectorUtility

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.