Package org.eclipse.jgit.diff

Source Code of org.eclipse.jgit.diff.SimilarityIndex

/*
* Copyright (C) 2010, Google Inc.
* and other copyright owners as documented in the project's IP log.
*
* This program and the accompanying materials are made available
* under the terms of the Eclipse Distribution License v1.0 which
* accompanies this distribution, is reproduced below, and is
* available at http://www.eclipse.org/org/documents/edl-v10.php
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above copyright
*   notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above
*   copyright notice, this list of conditions and the following
*   disclaimer in the documentation and/or other materials provided
*   with the distribution.
*
* - Neither the name of the Eclipse Foundation, Inc. nor the
*   names of its contributors may be used to endorse or promote
*   products derived from this software without specific prior
*   written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package org.eclipse.jgit.diff;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

import org.eclipse.jgit.errors.MissingObjectException;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.ObjectStream;

/**
* Index structure of lines/blocks in one file.
* <p>
* This structure can be used to compute an approximation of the similarity
* between two files. The index is used by {@link SimilarityRenameDetector} to
* compute scores between files.
* <p>
* To save space in memory, this index uses a space efficient encoding which
* will not exceed 1 MiB per instance. The index starts out at a smaller size
* (closer to 2 KiB), but may grow as more distinct blocks within the scanned
* file are discovered.
*/
class SimilarityIndex {
  /** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */
  private static final int MAX_HASH_BITS = 17;

  /** The {@link #idHash} table will not grow bigger than this, ever. */
  private static final int MAX_HASH_SIZE = 1 << MAX_HASH_BITS;

  /** Prime just before {@link #MAX_HASH_SIZE}. */
  private static final int P = 131071;

  /**
   * Shift to apply before storing a key.
   * <p>
   * Within the 64 bit table record space, we leave the highest bit unset so
   * all values are positive, and we need {@link #MAX_HASH_BITS} bits for the
   * keys. The lower 32 bits are used to count bytes impacted.
   */
  private static final int KEY_SHIFT = 64 - 1 - MAX_HASH_BITS;

  /** Total size of the file we hashed into the structure. */
  private long fileSize;

  /** Number of non-zero entries in {@link #idHash}. */
  private int idSize;

  /**
   * Pairings of content keys and counters.
   * <p>
   * Slots in the table are actually two ints wedged into a single long. The
   * upper {@link #MAX_HASH_BITS} bits stores the content key, and the
   * remaining lower bits stores the number of bytes associated with that key.
   * Empty slots are denoted by 0, which cannot occur because the count cannot
   * be 0. Values can only be positive, which we enforce during key addition.
   */
  private long[] idHash;

  SimilarityIndex() {
    idHash = new long[256];
  }

  long getFileSize() {
    return fileSize;
  }

  void setFileSize(long size) {
    fileSize = size;
  }

  void hash(ObjectLoader obj) throws MissingObjectException, IOException {
    if (obj.isLarge()) {
      ObjectStream in = obj.openStream();
      try {
        setFileSize(in.getSize());
        hash(in, fileSize);
      } finally {
        in.close();
      }
    } else {
      byte[] raw = obj.getCachedBytes();
      setFileSize(raw.length);
      hash(raw, 0, raw.length);
    }
  }

  void hash(byte[] raw, int ptr, final int end) {
    while (ptr < end) {
      int hash = 5381;
      int start = ptr;

      // Hash one line, or one block, whichever occurs first.
      do {
        int c = raw[ptr++] & 0xff;
        if (c == '\n')
          break;
        hash = (hash << 5) ^ c;
      } while (ptr < end && ptr - start < 64);
      add(hash, ptr - start);
    }
  }

  void hash(InputStream in, long remaining) throws IOException {
    byte[] buf = new byte[4096];
    int ptr = 0;
    int cnt = 0;

    while (0 < remaining) {
      int hash = 5381;

      // Hash one line, or one block, whichever occurs first.
      int n = 0;
      do {
        if (ptr == cnt) {
          ptr = 0;
          cnt = in.read(buf, 0, buf.length);
          if (cnt <= 0)
            throw new EOFException();
        }

        n++;
        int c = buf[ptr++] & 0xff;
        if (c == '\n')
          break;
        hash = (hash << 5) ^ c;
      } while (n < 64 && n < remaining);
      add(hash, n);
      remaining -= n;
    }
  }

  /**
   * Sort the internal table so it can be used for efficient scoring.
   * <p>
   * Once sorted, additional lines/blocks cannot be added to the index.
   */
  void sort() {
    // Sort the array. All of the empty space will wind up at the front,
    // because we forced all of the keys to always be positive. Later
    // we only work with the back half of the array.
    //
    Arrays.sort(idHash);
  }

  int score(SimilarityIndex dst, int maxScore) {
    long max = Math.max(fileSize, dst.fileSize);
    if (max == 0)
      return maxScore;
    return (int) ((common(dst) * maxScore) / max);
  }

  int common(SimilarityIndex dst) {
    return common(this, dst);
  }

  private static int common(SimilarityIndex src, SimilarityIndex dst) {
    int srcIdx = src.packedIndex(0);
    int dstIdx = dst.packedIndex(0);
    long[] srcHash = src.idHash;
    long[] dstHash = dst.idHash;
    return common(srcHash, srcIdx, dstHash, dstIdx);
  }

  private static int common(long[] srcHash, int srcIdx, //
      long[] dstHash, int dstIdx) {
    if (srcIdx == srcHash.length || dstIdx == dstHash.length)
      return 0;

    int common = 0;
    int srcKey = keyOf(srcHash[srcIdx]);
    int dstKey = keyOf(dstHash[dstIdx]);

    for (;;) {
      if (srcKey == dstKey) {
        common += Math.min(countOf(srcHash[srcIdx]),
            countOf(dstHash[dstIdx]));

        if (++srcIdx == srcHash.length)
          break;
        srcKey = keyOf(srcHash[srcIdx]);

        if (++dstIdx == dstHash.length)
          break;
        dstKey = keyOf(dstHash[dstIdx]);

      } else if (srcKey < dstKey) {
        // Regions of src which do not appear in dst.
        if (++srcIdx == srcHash.length)
          break;
        srcKey = keyOf(srcHash[srcIdx]);

      } else /* if (srcKey > dstKey) */{
        // Regions of dst which do not appear in dst.
        if (++dstIdx == dstHash.length)
          break;
        dstKey = keyOf(dstHash[dstIdx]);
      }
    }

    return common;
  }

  // Testing only
  int size() {
    return idSize;
  }

  // Testing only
  int key(int idx) {
    return keyOf(idHash[packedIndex(idx)]);
  }

  // Testing only
  long count(int idx) {
    return countOf(idHash[packedIndex(idx)]);
  }

  // Brute force approach only for testing.
  int findIndex(int key) {
    for (int i = 0; i < idSize; i++)
      if (key(i) == key)
        return i;
    return -1;
  }

  private int packedIndex(int idx) {
    return (idHash.length - idSize) + idx;
  }

  void add(int key, int cnt) {
    key = hash(key);
    int j = slot(key);
    for (;;) {
      long v = idHash[j];
      if (v == 0) {
        // Empty slot in the table, store here.
        if (shouldGrow()) {
          grow();
          j = slot(key);
          continue;
        }
        idHash[j] = (((long) key) << KEY_SHIFT) | cnt;
        idSize++;
        return;

      } else if (keyOf(v) == key) {
        // Same key, increment the counter.
        idHash[j] = v + cnt;
        return;

      } else if (++j >= idHash.length) {
        j = 0;
      }
    }
  }

  private static int hash(int key) {
    // Make the key fit into our table. Since we have a maximum size
    // that we cap the table at, all keys get squashed before going
    // into the table. This prevents overflow.
    //
    return (key >>> 1) % P;
  }

  private int slot(int key) {
    return key % idHash.length;
  }

  private boolean shouldGrow() {
    int n = idHash.length;
    return n < MAX_HASH_SIZE && n <= idSize * 2;
  }

  private void grow() {
    long[] oldHash = idHash;
    int oldSize = idHash.length;

    idHash = new long[2 * oldSize];
    for (int i = 0; i < oldSize; i++) {
      long v = oldHash[i];
      if (v != 0) {
        int j = slot(keyOf(v));
        while (idHash[j] != 0)
          if (++j >= idHash.length)
            j = 0;
        idHash[j] = v;
      }
    }
  }

  private static int keyOf(long v) {
    return (int) (v >>> KEY_SHIFT);
  }

  private static int countOf(long v) {
    return (int) v;
  }
}
TOP

Related Classes of org.eclipse.jgit.diff.SimilarityIndex

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.