Package org.wikipedia.miner.model

Examples of org.wikipedia.miner.model.Article


   */
  private double[] computeFeatureValues(Candidate candidate,
      boolean training, HashMap<String, Counter> hashKeyphrases,
      HashMap<String, Candidate> candidates) {

    Article candidateArticle = candidate.getArticle();

    // Compute feature values
    double[] newInst = new double[numFeatures + 1];

    String name = candidate.getName();
    String original = candidate.getBestFullForm();
    String title = candidate.getTitle();

    // Compute TFxIDF
    Counter counterGlobal = (Counter) globalDictionary.get(name);
    double globalVal = 0;
    if (counterGlobal != null) {
      globalVal = counterGlobal.value();
      if (training) {
        globalVal = globalVal - 1;
      }
    }
    double tf = candidate.getTermFrequency();
    double idf = -Math.log((globalVal + 1) / ((double) numDocs + 1));
    // System.out.println(candidate + " count: " + candidate.getFrequency() + "
    // tf: " + tf + " glob val: " + globalVal + " numDocs: " + numDocs + " idf:
    // " + idf);

    if (useBasicFeatures) {
      newInst[tfidfIndex] = tf * idf;
      newInst[firstOccurIndex] = candidate.getFirstOccurrence();
    }

    if (useFrequencyFeatures) {
      newInst[tfIndex] = tf;
      newInst[idfIndex] = idf;
    }

    if (usePositionsFeatures) {
      newInst[lastOccurIndex] = candidate.getLastOccurrence();
      newInst[spreadOccurIndex] = candidate.getSpread();
    }

    if (useKeyphrasenessFeature) {
      if (vocabularyName.equals("wikipedia")) {
        name = title;
      }
      Counter domainKeyphr = keyphraseDictionary.get(name);
      if ((training) && (hashKeyphrases != null)
          && (hashKeyphrases.containsKey(name))) {
        newInst[domainKeyphIndex] = domainKeyphr.value() - 1;
      } else {
        if (domainKeyphr != null) {
          newInst[domainKeyphIndex] = domainKeyphr.value();
        } else {
          newInst[domainKeyphIndex] = 0;
        }
      }
    }

    if (useLengthFeature) {

      if (original == null) {
        System.err.println("Warning! Problem with candidate " + name);
        newInst[lengthIndex] = 1.0;
      } else {
        String[] words = original.split(" ");
        newInst[lengthIndex] = (double) words.length;
      }
    }

    if (useNodeDegreeFeature) {
      int nodeDegree = 0;
      if (vocabularyName.equals("wikipedia")) {
        try {
          for (int relatedID : candidateArticle.getLinksInIds()) {
            if (candidates.containsKey(relatedID + "")) {
              nodeDegree++;
            }
          }
          for (int relatedID : candidateArticle.getLinksOutIds()) {
            if (candidates.containsKey(relatedID + "")) {
              nodeDegree++;
            }
          }
        } catch (SQLException e) {
          System.err.println("Error retrieving ids for candidate "+ candidate);
        }
      } else if (vocabulary != null) {

        Vector<String> relatedTerms = vocabulary.getRelated(name);

        if (relatedTerms != null) {
          for (String relatedTerm : relatedTerms) {
            if (candidates.get(relatedTerm) != null)
              nodeDegree++;
          }
        }

      }
    //  if (nodeDegree != 0) {
    //    System.out.println(candidate + " has node degree " + nodeDegree);
    //  }
      newInst[nodeDegreeIndex] = (double) nodeDegree;
    }

    if (useBasicWikipediaFeatures && wikipedia != null) {


      double wikipKeyphraseness = 0;
      if (vocabularyName.equals("wikipedia")) {
        wikipKeyphraseness = candidate.getWikipKeyphraseness()
      } else {
        Anchor anchor = null;
        try {
          anchor = new Anchor(wikipedia.getDatabase()
              .addEscapes(original), null, wikipedia.getDatabase());
          if (anchor != null) {
            if (anchor.getLinkProbability() != 0) {
              wikipKeyphraseness = anchor.getLinkProbability();
            }
          }
        } catch (SQLException e) {
          System.err.println("Error retrieving the anchor for " + candidate);
        //  e.printStackTrace();
        }
      }
      newInst[wikipKeyphrIndex] = wikipKeyphraseness; 
      newInst[totalWikipKeyphrIndex] = candidate.getTotalWikipKeyphraseness();

    //  System.out.println(candidate + "\t wikip Keyphr " + newInst[wikipKeyphrIndex] + "\t total wikip Keyphr " + newInst[totalWikipKeyphrIndex]);

    }

    if (useAllWikipediaFeatures) {

      if (candidateArticle == null) {
        try {
            candidateArticle = wikipedia.getMostLikelyArticle(original,
                new CaseFolder());
         
        } catch (SQLException e) {
          e.printStackTrace();
        }
      }
     
        double wikipFrequency = 0;
        double generality = 0;
        double semRelatedness = 0;

        if (candidateArticle != null) {
          try {
            double pageCount = candidateArticle.getLinksInCount();
            wikipFrequency = -Math.log(pageCount / 2000000);
            generality = candidateArticle.getGenerality();
          } catch (SQLException e) {
            e.printStackTrace();
          }
        }

        if (vocabularyName.equals("wikipedia") && candidateArticle != null) {
          for (Candidate c : candidates.values()) {
            if (!c.equals(candidate)) {
                double relatedness = 0;
                Article article = c.getArticle();
                try {
                  relatedness = candidateArticle.getRelatednessTo(article);
                 
                } catch (SQLException e) {
                  e.printStackTrace();
View Full Code Here


    // if not enough context was collected
    if (context.size() < maxContextSize) {
      // fill up context anchors with most likely mappings
      for (int i = 0; i < bestCandidateSenses.size()
      && context.size() < maxContextSize; i++) {
        Article sense = bestCandidateSenses.elementAt(i);
        context.add(sense);
      }
    }
    return context;
  }
View Full Code Here

TOP

Related Classes of org.wikipedia.miner.model.Article

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.