Examples of WikipediaPage


Examples of edu.umd.cloud9.collection.wikipedia.WikipediaPage

      }
    }

    public void map(Writable docnoKey, Indexable page, OutputCollector<PairOfInts, WikiDocInfo> output, Reporter reporter) throws IOException {
      int docno = ((IntWritable)docnoKey).get();
      WikipediaPage p = (WikipediaPage) page;
      String lang = p.getLanguage();
      ArrayListOfIntsWritable similarDocnos;

      // we only load the mapping once, during the first map() call of a mapper.
      // this works b/c all input kv pairs of a given mapper will have same lang id (reason explained above)
      if(pwsimMapping.isEmpty()){
        loadPairs(pwsimMapping, lang, mJob, reporter);
        sLogger.debug(pwsimMapping.size());
      }
     
      // if no similar docs for docno, return
      if(pwsimMapping.containsKey(docno)){
        similarDocnos = pwsimMapping.get(docno);  
      }else{
        return;
      }

      ArrayListWritable<Text> sentences;
      ArrayListWritable<HMapSFW> vectors = new ArrayListWritable<HMapSFW>();
      ArrayListOfIntsWritable sentLengths = new ArrayListOfIntsWritable();
      try {
        if(lang.equals("en")){
          // identify sentences in document, filter out ones below MinSentLength threshold
          // convert each sentence into a tf-idf vector, using general DF map for collection and a heuristic for avg. doc length
          // filter out sentences for which the vector has less than MinVectorTerms terms
          sentences = helper.getESentences(p.getContent(), vectors, sentLengths);   
         
        }else{
          sentences = helper.getFSentences(p.getContent(), vectors, sentLengths);
        }
        if(sentences.size() != vectors.size()) {
          throw new RuntimeException("Sentences.size != Vectors.size");
        }
      } catch (Exception e) {
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.