Package de.jungblut.math

Examples of de.jungblut.math.DoubleVector


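The snippets below show typical uses of DoubleVector and its dense and sparse implementations: text vectorization (word frequency, tf-idf and the hashing trick), sparse logistic-regression training, and decision-tree split handling.
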
  public static List<DoubleVector> wordFrequencyVectorize(
      List<String[]> tokenizedDocuments, String[] dictionary) {

    List<DoubleVector> vectorList = new ArrayList<>(tokenizedDocuments.size());
    int oovIndex = Arrays.binarySearch(dictionary, OUT_OF_VOCABULARY);
    for (String[] arr : tokenizedDocuments) {
      DoubleVector vector = new SparseDoubleVector(dictionary.length);
      HashMultiset<String> set = HashMultiset.create(Arrays.asList(arr));
      for (String s : arr) {
        int foundIndex = Arrays.binarySearch(dictionary, s);
        // unknown tokens fall back to the OUT_OF_VOCABULARY index if the
        // dictionary contains one, otherwise they are dropped
        if (foundIndex >= 0) {
          // the index is equal to its mapped dimension
          vector.set(foundIndex, set.count(s));
        } else if (oovIndex >= 0) {
          vector.set(oovIndex, 1);
        }
      }
      vectorList.add(vector);
    }
    return vectorList;
  }
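
A minimal usage sketch; the method name wordFrequencyVectorize is inferred from context, since the original signature is cut off above. Note that the dictionary must be sorted, because tokens are resolved with Arrays.binarySearch:

    String[] dictionary = { "brown", "fox", "quick", "the" }; // sorted!
    List<String[]> docs = Arrays.asList(
        new String[] { "the", "quick", "fox" },
        new String[] { "the", "the", "brown", "fox" });
    List<DoubleVector> vectors = wordFrequencyVectorize(docs, dictionary);
    // vectors.get(1) holds 2.0 at the index of "the" and 1.0 at "brown" and
    // "fox"; unknown tokens are silently dropped here because the dictionary
    // contains no OUT_OF_VOCABULARY entry (oovIndex < 0)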


  /**
   * Vectorizes a single tokenized document into tf-idf space: each term
   * frequency is weighted by log(numDocuments / documentFrequency).
   */
  public static DoubleVector tfIdfVectorize(int numDocuments,
      String[] document, String[] dictionary, int[] termDocumentCount) {

    final int numTokens = dictionary.length;
    DoubleVector vector = new SparseDoubleVector(numTokens);
    HashMultiset<String> termFrequencySet = HashMultiset.create(Arrays
        .asList(document));

    int oovIndex = Arrays.binarySearch(dictionary, OUT_OF_VOCABULARY);
    double docLog = FastMath.log(numDocuments);

    for (String token : document) {
      int index = Arrays.binarySearch(dictionary, token);
      if (index >= 0) {
        double tfIdf = termFrequencySet.count(token)
            * (docLog - FastMath.log(termDocumentCount[index]));
        vector.set(index, tfIdf);
      } else {
        if (oovIndex >= 0) {
          vector.set(oovIndex, 1d);
        }
      }
    }
    return vector;
  }
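
A usage sketch for the tf-idf variant, assuming termDocumentCount holds the document frequency for each dictionary index (same sorted-dictionary requirement as above):

    String[] dictionary = { "brown", "fox", "quick", "the" }; // sorted
    int[] termDocumentCount = { 10, 25, 5, 100 }; // df per dictionary index
    String[] document = { "the", "quick", "quick", "fox" };
    DoubleVector v = tfIdfVectorize(100, document, dictionary,
        termDocumentCount);
    // "quick" (index 2): tf = 2, idf = log(100) - log(5) = log(20), so
    // v.get(2) == 2 * log(20); "the" occurs in all 100 documents, so its
    // idf and therefore its weight is 0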

  /**
   * Vectorizes the given feature via the hashing trick into n dimensions.
   *
   * @return the new n-dimensional dense vector vectorized via the hashing
   *         trick.
   */
  public static DoubleVector hashVectorize(DoubleVector inputFeature, int n,
      com.google.common.hash.HashFunction hashFunction) {
    DoubleVector dense = new DenseDoubleVector(n);
    Iterator<DoubleVectorElement> iterateNonZero = inputFeature
        .iterateNonZero();
    while (iterateNonZero.hasNext()) {
      DoubleVectorElement next = iterateNonZero.next();
      // get the hash as int
      int hash = hashFunction.hashInt(next.getIndex()).asInt();
      // take the absolute value so the bucket index is non-negative (note:
      // Math.abs(Integer.MIN_VALUE) stays negative, so Math.floorMod(hash, n)
      // would be the watertight choice)
      int bucket = Math.abs(hash) % n;
      // subtract 1 in case of negative values, else increment.
      // this replaces the second hash function proposed by Weinberger et al.
      dense.set(bucket, dense.get(bucket) + (hash < 0 ? -1d : 1d));
    }
    return dense;
  }
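
A usage sketch with Guava's murmur3 hash function. Note that only the positions of the non-zero entries are hashed; their magnitudes are ignored, each contributing +/-1 to its bucket:

    DoubleVector input = new SparseDoubleVector(1_000_000);
    input.set(12345, 1d);
    input.set(987654, 1d);
    DoubleVector hashed = hashVectorize(input, 1024,
        com.google.common.hash.Hashing.murmur3_32());
    // hashed is a 1024-dimensional dense vector; the sign trick keeps the
    // hashed inner products unbiased in expectation (Weinberger et al., 2009)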

    for (int epoch = 0; epoch < epochs; epoch++) {
      double lossSum = 0d;
      int localItems = 0;
      for (Tuple<DoubleVector, DoubleVector> tuple : dataStream) {
        localItems++;
        DoubleVector feature = tuple.getFirst();
        DoubleVector outcome = tuple.getSecond();
        DoubleVector z1 = theta.multiplyVectorColumn(feature);
        DoubleVector activations = SIGMOID.apply(z1);
        double loss = LOSS.calculateError(
            new SparseDoubleRowMatrix(Arrays.asList(outcome)),
            new SparseDoubleRowMatrix(Arrays.asList(activations)));
        lossSum += loss;
        DoubleVector activationDifference = activations.subtract(outcome);
        // sparsity-aware update: only touch the weights of non-zero features
        Iterator<DoubleVectorElement> featureIterator = feature
            .iterateNonZero();
        while (featureIterator.hasNext()) {
          DoubleVectorElement next = featureIterator.next();
          DoubleVector rowVector = theta.getRowVector(next.getIndex());
          double l2 = rowVector.pow(2d).sum();
          Iterator<DoubleVectorElement> diffIterator = activationDifference
              .iterateNonZero();
          while (diffIterator.hasNext()) {
            DoubleVectorElement diffElement = diffIterator.next();
            double val = rowVector.get(diffElement.getIndex());
            if (val != 0) {
              val = val - diffElement.getValue() * alpha;
              // apply the decay
              if (lambda != 0d) {
                val -= ((lambda * val) + (1d - lambda) * l2);
              }
              if (Math.abs(val) < nearZeroLimit) {
                val = 0;
              }
              rowVector.set(diffElement.getIndex(), val);
            }
          }
        }
        if (verbose && localItems % reportInterval == 0) {
          System.out.format(" Item %d | AVG Loss: %f\r", localItems,
              lossSum / localItems);
        }
      }
    }
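
Both inner iterators skip zero entries, so the cost of one training example is proportional to the number of non-zero features times the number of non-zero prediction errors rather than to the full dimensions of theta, which is what makes this loop practical for high-dimensional sparse text features. The nearZeroLimit truncation additionally zeroes out weights that have decayed close to zero, keeping the model itself sparse.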

      clz = 0;
    }
    // clz now holds the winning class index of the prediction
    if (binaryClassification) {
      return new DenseDoubleVector(new double[] { clz });
    } else {
      DoubleVector vec = outcomeDimension > 10
          ? new SparseDoubleVector(outcomeDimension)
          : new DenseDoubleVector(outcomeDimension);
      vec.set(clz, 1);
      return vec;
    }
  }
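
Because a one-hot outcome has exactly one non-zero component, the code switches to a SparseDoubleVector once the outcome dimension exceeds 10, while binary classification collapses to a single 0/1 component.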

    List<DoubleVector> newFeatures = Lists.newLinkedList();
    List<DoubleVector> newOutcomes = Lists.newLinkedList();

    Iterator<DoubleVector> featureIterator = features.iterator();
    Iterator<DoubleVector> outcomeIterator = outcome.iterator();
    while (featureIterator.hasNext()) {
      DoubleVector feature = featureIterator.next();
      DoubleVector out = outcomeIterator.next();
      if (((int) feature.get(bestSplitIndex)) == nominalValue) {
        newFeatures.add(feature);
        newOutcomes.add(out);
      }
    }
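
This loop materializes one child of a nominal decision-tree split: only the rows whose value at bestSplitIndex equals the branch's nominalValue are copied into the child's feature and outcome lists.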

    List<DoubleVector> newFeatures = Lists.newLinkedList();
    List<DoubleVector> newOutcomes = Lists.newLinkedList();

    Iterator<DoubleVector> featureIterator = features.iterator();
    Iterator<DoubleVector> outcomeIterator = outcome.iterator();
    while (featureIterator.hasNext()) {
      DoubleVector feature = featureIterator.next();
      DoubleVector out = outcomeIterator.next();
      if (lower) {
        if (feature.get(bestSplitIndex) <= splitValue) {
          newFeatures.add(feature);
          newOutcomes.add(out);
        }
      } else {
        // complement branch (inferred): rows above the split value
        if (feature.get(bestSplitIndex) > splitValue) {
          newFeatures.add(feature);
          newOutcomes.add(out);
        }
      }
    }
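
The numerical variant is the threshold counterpart: with lower set it keeps the rows whose value at bestSplitIndex is at most splitValue, and the else branch keeps the rows above it.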

  private void initWeights(
      Iterable<Tuple<DoubleVector, DoubleVector>> dataStream, DoubleMatrix theta) {
    for (Tuple<DoubleVector, DoubleVector> tuple : dataStream) {
      // randomly initialize our weight matrix by the data we have seen
      DoubleVector feature = tuple.getFirst();
      DoubleVector outcome = tuple.getSecond();
      Iterator<DoubleVectorElement> featureIterator = feature.iterateNonZero();
      while (featureIterator.hasNext()) {
        DoubleVectorElement feat = featureIterator.next();
        Iterator<DoubleVectorElement> outcomeIterator = outcome
            .iterateNonZero();
        while (outcomeIterator.hasNext()) {
          DoubleVectorElement out = outcomeIterator.next();
          theta.set(feat.getIndex(), out.getIndex(), random.nextDouble());
        }
      }
    }
  }
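
Note that initWeights seeds theta only at (feature, outcome) index pairs that actually co-occur in the training stream; weights for combinations that never appear stay at zero, which matches the sparsity-aware update loop shown earlier.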

      TIntObjectHashMap<int[]> featureValueOutcomeCount =
          new TIntObjectHashMap<>();
      TIntIntHashMap rowSums = new TIntIntHashMap();
      int numFeatures = 0;
      Iterator<DoubleVector> featureIterator = features.iterator();
      Iterator<DoubleVector> outcomeIterator = outcome.iterator();
      while (featureIterator.hasNext()) {
        DoubleVector feature = featureIterator.next();
        DoubleVector out = outcomeIterator.next();
        int classIndex = getOutcomeClassIndex(out);
        int nominalFeatureValue = (int) feature.get(featureIndex);
        int[] is = featureValueOutcomeCount.get(nominalFeatureValue);
        if (is == null) {
          is = new int[outcomeDimension];
          featureValueOutcomeCount.put(nominalFeatureValue, is);
        }
        is[classIndex]++;
        rowSums.put(nominalFeatureValue, rowSums.get(nominalFeatureValue) + 1);
        numFeatures++;
      }
      double entropySum = 0d;
      // now we can calculate the entropy
      TIntObjectIterator<int[]> iterator = featureValueOutcomeCount.iterator();
      while (iterator.hasNext()) {
        iterator.advance();
        int[] outcomeCounts = iterator.value();
        double condEntropy = rowSums.get(iterator.key()) / (double) numFeatures
            * getEntropy(outcomeCounts);
        entropySum += condEntropy;
      }
      return new Split(featureIndex, overallEntropy - entropySum);
    } else {
      // numerical case
      Iterator<DoubleVector> featureIterator = features.iterator();
      TDoubleHashSet possibleFeatureValues = new TDoubleHashSet();
      while (featureIterator.hasNext()) {
        DoubleVector feature = featureIterator.next();
        possibleFeatureValues.add(feature.get(featureIndex));
      }
      double bestInfogain = -1;
      double bestSplit = 0.0;
      TDoubleIterator iterator = possibleFeatureValues.iterator();
      while (iterator.hasNext()) {
        double candidate = iterator.next();
        // (remainder elided in this excerpt) evaluate the information gain
        // of splitting at this candidate value and keep the best
        // bestInfogain / bestSplit pair
      }
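
Both branches compute the classic ID3-style information gain, gain(feature) = H(outcome) - sum over the feature's values v of P(feature = v) * H(outcome | feature = v); in the nominal case this is exactly overallEntropy - entropySum, and the numerical case searches the observed values for the threshold that maximizes the same quantity.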

    int highCount = 0;
    // each side of the split needs its own histogram; filling both slots
    // with the same array (as Arrays.fill would) makes them alias
    counts[0] = new int[outcomeDimension];
    counts[1] = new int[outcomeDimension];
    Iterator<DoubleVector> featureIterator = features.iterator();
    Iterator<DoubleVector> outcomeIterator = outcome.iterator();
    while (featureIterator.hasNext()) {
      DoubleVector feature = featureIterator.next();
      DoubleVector out = outcomeIterator.next();
      int idx = getOutcomeClassIndex(out);
      if (feature.get(featureIndex) > value) {
        counts[1][idx]++;
        highCount++;
      } else {
        counts[0][idx]++;
      }
    }
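
As a worked example: with outcomeDimension = 2 and ten rows of which four satisfy feature.get(featureIndex) > value, counts[1] ends up holding the class histogram of those four rows, and the conditional entropy of the split combines as 0.4 * H(counts[1]) + 0.6 * H(counts[0]), mirroring the weighting used in the nominal case above.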
