/*
* Copyright Myrrix Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.myrrix.common.random;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.List;
import com.google.common.base.Preconditions;
import org.apache.commons.math3.primes.Primes;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.util.FastMath;
import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
import net.myrrix.common.LangUtils;
import net.myrrix.common.collection.FastIDSet;
import net.myrrix.common.collection.SamplingLongPrimitiveIterator;
import net.myrrix.common.math.SimpleVectorMath;
/**
* Helpful methods related to randomness and related functions. Some parts derived from Mahout.
*
* @author Sean Owen
* @author Mahout
* @since 1.0
*/
public final class RandomUtils {
/** The largest prime less than 2<sup>31</sup>-1 that is the smaller of a twin prime pair. */
public static final int MAX_INT_SMALLER_TWIN_PRIME = 2147482949;
private static final MessageDigest MD5_DIGEST;
static {
try {
MD5_DIGEST = MessageDigest.getInstance("MD5");
} catch (NoSuchAlgorithmException e) {
// Can't happen
throw new IllegalStateException(e);
}
}
private RandomUtils() {
}
/**
* @param dimensions dimensionality of resulting vector
* @param random random number generator to use
* @return a vector of length 1 over the given number of dimensions, whose direction is chosen uniformly
* at random (that is: a point chosen uniformly at random on the unit hypersphere)
*/
public static float[] randomUnitVector(int dimensions, RandomGenerator random) {
float[] vector = new float[dimensions];
doRandomUnitVector(vector, random);
return vector;
}
private static void doRandomUnitVector(float[] vector, RandomGenerator random) {
int dimensions = vector.length;
double total = 0.0;
for (int i = 0; i < dimensions; i++) {
double d = random.nextGaussian();
vector[i] = (float) d;
total += d * d;
}
float normalization = (float) FastMath.sqrt(total);
for (int i = 0; i < dimensions; i++) {
vector[i] /= normalization;
}
}
/**
* @param dimensions dimensionality of resulting vector
* @param farFrom vectors that the chosen vector should be "far from" -- not in the same direction as
* @param random random number generator to use
* @return a vector of length 1 over the given number of dimensions, whose direction is chosen uniformly
* at random (that is: a point chosen uniformly at random on the unit hypersphere), but preferring
* those not in the same direction as a set of existing vectors
*/
public static float[] randomUnitVectorFarFrom(int dimensions,
List<float[]> farFrom,
RandomGenerator random) {
int size = farFrom.size();
int numSamples = FastMath.min(100, size);
float[] vector = new float[dimensions];
boolean accepted = false;
while (!accepted) {
doRandomUnitVector(vector, random);
double smallestDistSquared = Double.POSITIVE_INFINITY;
for (int sample = 0; sample < numSamples; sample++) {
float[] other = farFrom.get(size == numSamples ? sample : random.nextInt(size));
// dot is the cosine here since both are unit vectors
double distSquared = 2.0 - 2.0 * SimpleVectorMath.dot(vector, other);
if (LangUtils.isFinite(distSquared) && distSquared < smallestDistSquared) {
smallestDistSquared = distSquared;
}
}
// Second condition covers 1-D case, where there are only 2 distinct unit vectors. If both have
// been generated, keep accepting either of them.
if (LangUtils.isFinite(smallestDistSquared) && !(dimensions == 1 && smallestDistSquared == 0.0)) {
// Choose with probability proportional to squared distance, a la kmeans++ centroid selection
double acceptProbability = smallestDistSquared / 4.0; // dist squared is in [0,4]
accepted = random.nextDouble() < acceptProbability;
} else {
// kind of a default
accepted = true;
}
}
return vector;
}
/**
* Finds next-largest "twin primes": numbers p and p+2 such that both are prime. Finds the smallest such p
* such that the smaller twin, p, is greater than or equal to n. Returns p+2, the larger of the two twins.
*/
public static int nextTwinPrime(int n) {
if (n > MAX_INT_SMALLER_TWIN_PRIME) {
throw new IllegalArgumentException();
}
if (n <= 3) {
return 5;
}
int next = Primes.nextPrime(n);
while (!Primes.isPrime(next + 2)) {
next = Primes.nextPrime(next + 4);
}
return next + 2;
}
/**
* @param l long to MD5 hash
* @return the bottom 8 bytes, as a {@code long}, of the MD5 hash of the given {@code long},
* which is itself treated as a big-endian sequence of 8 bytes
*/
public static long md5HashToLong(long l) {
byte[] hash;
synchronized (MD5_DIGEST) {
for (int i = 0; i < 8; i++) {
MD5_DIGEST.update((byte) l);
l >>= 8;
}
hash = MD5_DIGEST.digest();
}
long result = 0L;
// Use bottom 8 bytes
for (int i = 8; i < 16; i++) {
result = (result << 4) | (hash[i] & 0xFFL);
}
return result;
}
/**
* @param set to choose from
* @param random random number generator
* @return element of the set chosen uniformly at random
*/
public static int randomFrom(FastIDSet set, RandomGenerator random) {
int size = set.size();
Preconditions.checkArgument(size > 0, "Empty set");
LongPrimitiveIterator it = set.iterator();
it.skip(random.nextInt(size));
return (int) it.nextLong();
}
/**
* @param n approximate number of items to choose
* @param stream stream to choose from randomly
* @param streamSize (approximate) stream size
* @param random random number generator
* @return up to n elements chosen uninformly at random from the stream
*/
public static long[] chooseAboutNFromStream(int n,
LongPrimitiveIterator stream,
int streamSize,
RandomGenerator random) {
LongPrimitiveIterator it;
if (n < streamSize) {
it = new SamplingLongPrimitiveIterator(random, stream, (double) n / streamSize);
} else {
it = stream;
}
FastIDSet chosen = new FastIDSet(n);
while (it.hasNext()) {
chosen.add(it.nextLong());
}
return chosen.toArray();
}
}