Examples of SequenceFileDirectoryReader


Examples of com.mozilla.hadoop.fs.SequenceFileDirectoryReader

    }
   
    public static Map<Integer, PriorityQueue<Pair<Double,String>>> getDocIds(Path docTopicsPath, int numDocs) {
        Map<Integer, PriorityQueue<Pair<Double,String>>> docIdMap = new HashMap<Integer, PriorityQueue<Pair<Double,String>>>();
        Map<Integer, Double> maxDocScores = new HashMap<Integer,Double>();
        SequenceFileDirectoryReader pointsReader = null;
        try {
            Text k = new Text();
            VectorWritable vw = new VectorWritable();
            pointsReader = new SequenceFileDirectoryReader(docTopicsPath);
            while (pointsReader.next(k, vw)) {
                String docId = k.toString();
                Vector normGamma = vw.get();
                Iterator<Element> iter = normGamma.iterateNonZero();
                double maxTopicScore = 0.0;
                int topic = 0;
                while (iter.hasNext()) {
                    Element e = iter.next();
                    double score = e.get();
                    if (score > maxTopicScore) {
                        maxTopicScore = score;
                        // iterateNonZero() skips zero entries, so a manual counter
                        // does not track the topic id; use the element's own index.
                        topic = e.index();
                    }
                }
               
                PriorityQueue<Pair<Double,String>> docIdsForTopic = docIdMap.get(topic);
                if (docIdsForTopic == null) {
                    docIdsForTopic = new PriorityQueue<Pair<Double,String>>(numDocs);
                }
               
                Double maxDocScoreForTopic = maxDocScores.get(topic);
                if (maxDocScoreForTopic == null) {
                    maxDocScoreForTopic = 0.0;
                }
                if (maxTopicScore > maxDocScoreForTopic) {
                    maxDocScores.put(topic, maxTopicScore);
                }
               
                enqueue(docIdsForTopic, docId, maxTopicScore, numDocs);
                docIdMap.put(topic, docIdsForTopic);
            }
        } catch (IOException e) {
            LOG.error("IOException caught while reading clustered points", e);
        } finally {
            if (pointsReader != null) {
                pointsReader.close();
            }
        }

        for (Map.Entry<Integer, Double> entry : maxDocScores.entrySet()) {
            System.out.println("For topic: " + entry.getKey() + " max score: " + entry.getValue());
View Full Code Here
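
Every excerpt above calls an enqueue(...) helper that this page does not show. A minimal hypothetical sketch of a bounded top-N queue, assuming a Pair<Double,String> that compares on its first element and exposes getFirst():

    // Hypothetical sketch of the bounded top-N helper called above; the real
    // implementation is not shown here. Assumes Pair<Double,String> compares on
    // its first element, so q.peek() is the lowest-scoring entry.
    private static void enqueue(PriorityQueue<Pair<Double,String>> q, String item,
                                double score, int maxSize) {
        if (q.size() < maxSize) {
            q.add(new Pair<Double,String>(score, item));
        } else if (!q.isEmpty() && q.peek().getFirst() < score) {
            q.poll(); // evict the current minimum to make room for a better entry
            q.add(new Pair<Double,String>(score, item));
        }
    }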

Examples of com.mozilla.hadoop.fs.SequenceFileDirectoryReader

    }
   
    public static Map<Integer,PriorityQueue<Pair<Double,String>>> getTopWordsByTopic(String stateDirPath, Map<Integer,String> featureIndex, int numWordsToPrint) {
        Map<Integer,Double> expSums = new HashMap<Integer, Double>();
        Map<Integer,PriorityQueue<Pair<Double,String>>> queues = new HashMap<Integer,PriorityQueue<Pair<Double,String>>>();
        SequenceFileDirectoryReader reader = null;
        try {
            IntPairWritable k = new IntPairWritable();
            DoubleWritable v = new DoubleWritable();
            reader = new SequenceFileDirectoryReader(new Path(stateDirPath));
            while (reader.next(k, v)) {
                int topic = k.getFirst();
                int featureId = k.getSecond();
                if (featureId >= 0 && topic >= 0) {
                    double score = v.get();
                    Double curSum = expSums.get(topic);
                    if (curSum == null) {
                        curSum = 0.0;
                    }
                    expSums.put(topic, curSum + Math.exp(score));
                    String feature = featureIndex.get(featureId);
                   
                    PriorityQueue<Pair<Double,String>> q = queues.get(topic);
                    if (q == null) {
                        q = new PriorityQueue<Pair<Double,String>>(numWordsToPrint);
                    }
                    enqueue(q, feature, score, numWordsToPrint);
                    queues.put(topic, q);
                }
            }
        } catch (IOException e) {
            LOG.error("Error reading LDA state dir", e);
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
       
        for (Map.Entry<Integer, PriorityQueue<Pair<Double,String>>> entry : queues.entrySet()) {
            int topic = entry.getKey();
View Full Code Here
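
The truncated tail presumably prints each queue's contents. One hedged way to emit a topic's words from highest to lowest score (a PriorityQueue's iterator is heap-ordered, not sorted), assuming the same Pair accessors:

    // Sketch only: poll() on a min-heap yields ascending scores, so collect
    // and reverse to print the best-scoring words first.
    List<Pair<Double,String>> topWords = new ArrayList<Pair<Double,String>>();
    PriorityQueue<Pair<Double,String>> q = entry.getValue();
    while (!q.isEmpty()) {
        topWords.add(q.poll());
    }
    Collections.reverse(topWords);
    for (Pair<Double,String> p : topWords) {
        System.out.println(topic + "\t" + p.getSecond() + "\t" + p.getFirst());
    }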

Examples of com.mozilla.hadoop.fs.SequenceFileDirectoryReader

    }
   
    public static Map<Integer, PriorityQueue<Pair<Double,String>>> getTopDocIdsByTopic(Path docTopicsPath, int numDocs) {
        Map<Integer, PriorityQueue<Pair<Double,String>>> docIdMap = new HashMap<Integer, PriorityQueue<Pair<Double,String>>>();
        Map<Integer, Double> maxDocScores = new HashMap<Integer,Double>();
        SequenceFileDirectoryReader pointsReader = null;
        try {
            Text k = new Text();
            VectorWritable vw = new VectorWritable();
            pointsReader = new SequenceFileDirectoryReader(docTopicsPath);
            while (pointsReader.next(k, vw)) {
                String docId = k.toString();
                Vector normGamma = vw.get();
                Iterator<Element> iter = normGamma.iterateNonZero();
                double maxTopicScore = 0.0;
                int topic = 0;
                while (iter.hasNext()) {
                    Element e = iter.next();
                    double score = e.get();
                    if (score > maxTopicScore) {
                        maxTopicScore = score;
                        // iterateNonZero() skips zero entries, so a manual counter
                        // does not track the topic id; use the element's own index.
                        topic = e.index();
                    }
                }
               
                PriorityQueue<Pair<Double,String>> docIdsForTopic = docIdMap.get(topic);
                if (docIdsForTopic == null) {
                    docIdsForTopic = new PriorityQueue<Pair<Double,String>>(numDocs);
                }
               
                Double maxDocScoreForTopic = maxDocScores.get(topic);
                if (maxDocScoreForTopic == null) {
                    maxDocScoreForTopic = 0.0;
                }
                if (maxTopicScore > maxDocScoreForTopic) {
                    maxDocScores.put(topic, maxTopicScore);
                }
               
                enqueue(docIdsForTopic, docId, maxTopicScore, numDocs);
                docIdMap.put(topic, docIdsForTopic);
            }
        } catch (IOException e) {
            LOG.error("IOException caught while reading clustered points", e);
        } finally {
            if (pointsReader != null) {
                pointsReader.close();
            }
        }

        for (Map.Entry<Integer, Double> entry : maxDocScores.entrySet()) {
            System.out.println("For topic: " + entry.getKey() + " max score: " + entry.getValue());
View Full Code Here
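
Every excerpt on this page follows the same read pattern. Reduced to its core, and assuming only the API visible here (a Path-based constructor, a SequenceFile-style next(key, value) that reuses its arguments, and a close() that does not throw), usage looks like:

    // Minimal usage sketch of SequenceFileDirectoryReader, inferred from these
    // excerpts; the path below is a placeholder.
    SequenceFileDirectoryReader reader = null;
    try {
        Text key = new Text();
        VectorWritable value = new VectorWritable();
        reader = new SequenceFileDirectoryReader(new Path("/data/doc-topics"));
        while (reader.next(key, value)) {
            // key and value are reused on each call; copy them if retained
            System.out.println(key.toString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (reader != null) {
            reader.close();
        }
    }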

Examples of com.mozilla.hadoop.fs.SequenceFileDirectoryReader

    }
   
    public static Map<Integer,PriorityQueue<Pair<Double,String>>> getTopWordsByTopics(String stateDirPath, Map<Integer,String> featureIndex, int numWordsToPrint) {
        Map<Integer,Double> expSums = new HashMap<Integer, Double>();
        Map<Integer,PriorityQueue<Pair<Double,String>>> queues = new HashMap<Integer,PriorityQueue<Pair<Double,String>>>();
        SequenceFileDirectoryReader reader = null;
        try {
            IntPairWritable k = new IntPairWritable();
            DoubleWritable v = new DoubleWritable();
            reader = new SequenceFileDirectoryReader(new Path(stateDirPath));
            while (reader.next(k, v)) {
                int topic = k.getFirst();
                int featureId = k.getSecond();
                if (featureId >= 0 && topic >= 0) {
                    double score = v.get();
                    Double curSum = expSums.get(topic);
                    if (curSum == null) {
                        curSum = 0.0;
                    }
                    expSums.put(topic, curSum + Math.exp(score));
                    String feature = featureIndex.get(featureId);
                   
                    PriorityQueue<Pair<Double,String>> q = queues.get(topic);
                    if (q == null) {
                        q = new PriorityQueue<Pair<Double,String>>(numWordsToPrint);
                    }
                    enqueue(q, feature, score, numWordsToPrint);
                    queues.put(topic, q);
                }
            }
        } catch (IOException e) {
            LOG.error("Error reading LDA state dir", e);
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
       
        for (Map.Entry<Integer, PriorityQueue<Pair<Double,String>>> entry : queues.entrySet()) {
            int topic = entry.getKey();
View Full Code Here
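
Since expSums accumulates Math.exp(score) per topic, the scores appear to be log-scale weights. Under that assumption, a word's normalized per-topic weight can be recovered with a small helper:

    // Sketch, assuming the raw scores are log-weights: normalize a word's score
    // against its topic's accumulated exp-sum to get a value in [0, 1].
    private static double normalizedWeight(double score, double expSumForTopic) {
        return Math.exp(score) / expSumForTopic;
    }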

Examples of com.mozilla.hadoop.fs.SequenceFileDirectoryReader

    }
   
    public Map<Integer,Set<String>> getDocIds(double sampleRate) {
        Random rand = new Random();
        Map<Integer,Set<String>> docIdMap = new HashMap<Integer,Set<String>>();
        SequenceFileDirectoryReader pointsReader = null;
        try {
            IntWritable k = new IntWritable();
            WeightedVectorWritable wvw = new WeightedVectorWritable();
            pointsReader = new SequenceFileDirectoryReader(clusteredPointsPath);
            while (pointsReader.next(k, wvw)) {
                int clusterId = k.get();               
                Vector v = wvw.getVector();
                if (v instanceof NamedVector) {
                    if (rand.nextDouble() < sampleRate) {
                        NamedVector nv = (NamedVector)v;
                        Set<String> curDocIds = docIdMap.get(clusterId);
                        if (curDocIds == null) {
                            curDocIds = new HashSet<String>();
                        }
                        curDocIds.add(nv.getName());
                        docIdMap.put(clusterId, curDocIds);
                    }
                }
            }
        } catch (IOException e) {
            LOG.error("IOException caught while reading clustered points", e);
        } finally {
            if (pointsReader != null) {
                pointsReader.close();
            }
        }
       
        return docIdMap;
    }
View Full Code Here
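
sampleRate acts as a Bernoulli keep-probability per point, so the expected sample is sampleRate times the number of named vectors in each cluster. A usage sketch on an instance of this class:

    // Usage sketch: keep roughly 10% of the named points in each cluster.
    Map<Integer,Set<String>> sampled = getDocIds(0.1);
    for (Map.Entry<Integer,Set<String>> e : sampled.entrySet()) {
        System.out.println("cluster " + e.getKey() + ": "
                           + e.getValue().size() + " sampled doc ids");
    }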

Examples of com.mozilla.hadoop.fs.SequenceFileDirectoryReader

        this.invertedFeatureIndex = Dictionary.loadInvertedFeatureIndex(dictionaryPath);
    }
   
    public Map<Integer,Cloud> getClouds(Cloud template) {
        Map<Integer,Cloud> cloudMap = new HashMap<Integer,Cloud>();
        SequenceFileDirectoryReader pointsReader = null;
        try {
            IntWritable k = new IntWritable();
            WeightedVectorWritable wvw = new WeightedVectorWritable();
            pointsReader = new SequenceFileDirectoryReader(clusteredPointsPath);
            while (pointsReader.next(k, wvw)) {
                int clusterId = k.get();
                Cloud c = cloudMap.get(clusterId);
                if (c == null) {
                    c = new Cloud(template);
                }
                Iterator<Element> viter = wvw.getVector().iterateNonZero();
                while (viter.hasNext()) {
                    Element e = viter.next();
                    String feature = invertedFeatureIndex.get(e.index());
                    c.addTag(new Tag(feature, e.get()));
                }
               
                cloudMap.put(clusterId, c);
            }
        } catch (IOException e) {
            LOG.error("IOException caught while reading clustered points", e);
        } finally {
            if (pointsReader != null) {
                pointsReader.close();
            }
        }
       
        return cloudMap;
    }
View Full Code Here
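
Cloud and Tag are project classes; only the copy constructor and addTag(Tag) are visible above, so anything beyond that is an assumption. A hedged usage sketch:

    // Sketch: build one tag cloud per cluster from a shared template. The no-arg
    // Cloud constructor here is hypothetical; only Cloud(template) and
    // addTag(Tag) appear in the excerpt.
    Cloud template = new Cloud();
    Map<Integer,Cloud> clouds = getClouds(template);
    System.out.println("built " + clouds.size() + " clouds");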

Examples of com.mozilla.hadoop.fs.SequenceFileDirectoryReader

    private static final Logger LOG = Logger.getLogger(DisplayKMeansBase.class);
   
    public List<Pair<Integer,Vector>> readClusteredPoints(Path clusteredPointsPath) {
        List<Pair<Integer,Vector>> clusteredPoints = new ArrayList<Pair<Integer,Vector>>();
        SequenceFileDirectoryReader pointsReader = null;
        try {
            IntWritable k = new IntWritable();
            WeightedVectorWritable wvw = new WeightedVectorWritable();
            pointsReader = new SequenceFileDirectoryReader(clusteredPointsPath);
            while (pointsReader.next(k, wvw)) {               
                clusteredPoints.add(new Pair<Integer,Vector>(k.get(), wvw.getVector()));
            }
        } catch (IOException e) {
            LOG.error("IOException caught while reading clustered points", e);
        } finally {
            if (pointsReader != null) {
                pointsReader.close();
            }
        }
       
        return clusteredPoints;
    }
View Full Code Here
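
A usage sketch that folds the returned pairs into per-cluster counts; the getFirst() accessor on Pair is an assumption, matching the (clusterId, vector) construction above:

    // Usage sketch: tally how many points fell into each cluster.
    Map<Integer,Integer> sizes = new HashMap<Integer,Integer>();
    for (Pair<Integer,Vector> p : readClusteredPoints(clusteredPointsPath)) {
        Integer n = sizes.get(p.getFirst());
        sizes.put(p.getFirst(), n == null ? 1 : n + 1);
    }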

Examples of com.mozilla.hadoop.fs.SequenceFileDirectoryReader

        return clusteredPoints;
    }
   
    public List<Cluster> readClustersIteration(Path clusterIterationPath) {
        List<Cluster> clusters = new ArrayList<Cluster>();
        SequenceFileDirectoryReader iterationReader = null;
        try {
            Text k = new Text();
            Cluster c = new Cluster();
            iterationReader = new SequenceFileDirectoryReader(clusterIterationPath);
            while (iterationReader.next(k, c)) {
                clusters.add(c);
                // next() reuses the value instance it is handed, so allocate a
                // fresh Cluster per record; otherwise the list would hold the
                // same object repeatedly, overwritten on every iteration.
                c = new Cluster();
            }
        } catch (IOException e) {
            LOG.error("IOException caught while reading clustered points", e);
        } finally {
            if (iterationReader != null) {
                iterationReader.close();
            }
        }

        return clusters;
    }
View Full Code Here
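
Allocating a fresh Cluster per record, as in the fix above, is the simplest way around Hadoop's Writable reuse. Where construction is expensive, Hadoop's WritableUtils.clone offers an alternative; a sketch, assuming a Configuration is at hand:

    // Alternative sketch: deep-copy each reused value via a serialization
    // round-trip instead of allocating a fresh instance by hand.
    Configuration conf = new Configuration();
    Text k = new Text();
    Cluster c = new Cluster();
    while (iterationReader.next(k, c)) {
        clusters.add(WritableUtils.clone(c, conf));
    }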