Package org.dbpedia.spotlight.tagging

Examples of org.dbpedia.spotlight.tagging.TaggedToken


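  /**
   * Encodes the token to the right of the surface form occurrence as a feature value.
   *
   * @param surfaceFormOccurrence the surface form in context
   * @return 0 if the POS tag of the next token starts with "vb", 1 if the next token is "of",
   *         otherwise null (also when the next token cannot be found)
   */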
  public static Integer nextPOS(SurfaceFormOccurrence surfaceFormOccurrence) {
    TaggedToken rightNeighbour = null;

    try {
      rightNeighbour = ((TaggedText) surfaceFormOccurrence.context()).taggedTokenProvider()
          .getRightNeighbourToken(surfaceFormOccurrence);
    } catch (ItemNotFoundException e) {
      return null;
    }

    if (rightNeighbour.getPOSTag().startsWith("vb"))
      return 0;
    else if(rightNeighbour.getToken().equals("of"))
      return 1;
    else
      return null;

  }
View Full Code Here


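  /**
   * Checks whether the surface form occurrence is part of an enumeration.
   *
   * @param surfaceFormOccurrence the surface form in context
   * @return true if the left neighbour is tagged "," and the right neighbour is tagged "cc" or "m";
   *         false otherwise, including when a neighbour cannot be found
   */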
  public static boolean isInEnumeration(SurfaceFormOccurrence surfaceFormOccurrence) {

    TaggedToken leftNeighbour;
    TaggedToken rightNeighbour;

    try {
      leftNeighbour = ((TaggedText) surfaceFormOccurrence.context()).taggedTokenProvider()
          .getLeftNeighbourToken(surfaceFormOccurrence);
      rightNeighbour = ((TaggedText) surfaceFormOccurrence.context()).taggedTokenProvider()
          .getRightNeighbourToken(surfaceFormOccurrence);
    } catch (ItemNotFoundException e) {
      return false;
    }

    if(leftNeighbour.getPOSTag().equals(",") &&  (rightNeighbour.getPOSTag().equals("cc") || rightNeighbour.getPOSTag().equals("m")))
      return true;
    else
      return false;
  }
View Full Code Here

    TaggedText text = (TaggedText) surfaceFormOccurrence.context();
    List<TaggedToken> candidateTokens = text.taggedTokenProvider().getTaggedTokens(surfaceFormOccurrence);
    int termSize = candidateTokens.size();

    TaggedToken firstTaggedToken = candidateTokens.get(0);
    CandidateData firstTaggedTokenData = null;
    try {
      firstTaggedTokenData = dataProvider.getCandidateData(firstTaggedToken.getToken());
    } catch (ItemNotFoundException e) {
      //No information about the token!
    }


    CandidateData secondTaggedTokenData = null;
    if(candidateTokens.size() > 1){
      TaggedToken secondTaggedToken = candidateTokens.get(1);
      try {
        secondTaggedTokenData = dataProvider.getCandidateData(secondTaggedToken.getToken());
      } catch (ItemNotFoundException e) {
        //No information about the token!
      }
    }

    TaggedToken lastTaggedToken = candidateTokens.get(candidateTokens.size()-1);
    CandidateData lastTaggedTokenData = null;
    try {
      lastTaggedTokenData = dataProvider.getCandidateData(lastTaggedToken.getToken());
    } catch (ItemNotFoundException e) {
      //No information about the token!
    }


    CandidateData lastBut1TaggedTokenData = null;

    if(candidateTokens.size() > 1) {
      TaggedToken lastBut1TaggedToken = candidateTokens.get(candidateTokens.size()-2);
      try {
        lastBut1TaggedTokenData = dataProvider.getCandidateData(lastBut1TaggedToken.getToken());
      } catch (ItemNotFoundException e) {
        //No information about the token!
      }
    }


    /**
     * Left context
     */

    List<TaggedToken> leftContext = null;
    try {
      leftContext = text.taggedTokenProvider().getLeftContext(surfaceFormOccurrence, 2);
    } catch (ItemNotFoundException ignored) {}

    CandidateData left1 = null;
    if(leftContext != null && leftContext.size() > 0) {
      try {
        String token;
        if(leftContext.size() == 1) {
          /**
           * There are no more tokens to the left, the token is sentence initial.
           */
          token = leftContext.get(0).getToken().toLowerCase();

        }else{
          token = leftContext.get(0).getToken();

        }
        left1 = dataProvider.getCandidateData(token);
      } catch (ItemNotFoundException e) {
        //No information about the token
      }
    }


    /**
     * Right context
     */

    List<TaggedToken> rightContext = null;
    try {
      rightContext = text.taggedTokenProvider().getRightContext(surfaceFormOccurrence, 2);
    } catch (ItemNotFoundException ignored) {}

    CandidateData right1 = null;
    if(rightContext != null && rightContext.size() > 0) {
      try {
        right1 = dataProvider.getCandidateData(rightContext.get(0).getToken());
      } catch (ItemNotFoundException e) {
        //No information about the token
      }
    }



    /**
     * Features:
     */

    if(termSize == 2) {

      try {
        if(firstTaggedTokenData != null && secondTaggedTokenData != null) {
          CoOccurrenceData bigramData = dataProvider.getBigramData(firstTaggedTokenData, secondTaggedTokenData);

          //if (bigramData.getUnitCountWeb() > bigramLeftWebMin)
            instance.setValue(i(count_web, buildAttributeList()), bigramData.getUnitCountWeb());
        }
      } catch (ItemNotFoundException ignored) {}
      catch (ArrayIndexOutOfBoundsException ignored) {}



    }



    List<String> verbs = new LinkedList<String>();

    boolean allLowercase = surfaceFormOccurrence.surfaceForm().name().toLowerCase().equals(surfaceFormOccurrence.surfaceForm().name());
    boolean allUppercase = surfaceFormOccurrence.surfaceForm().name().toUpperCase().equals(surfaceFormOccurrence.surfaceForm().name());

    int capitalizedWords = 0;

    for(TaggedToken candidateToken : candidateTokens) {
      if(candidateToken.getPOSTag().startsWith("v") || candidateToken.getPOSTag().equals("be")) {
        verbs.add(candidateToken.getPOSTag());
      }

      if(Character.isUpperCase(candidateToken.getToken().charAt(0)))
        capitalizedWords++;
    }

    try{
      if(verbs.size() > 1)
        instance.setValue(i(contains_verb, buildAttributeList()), 5);
      else if(verbs.size()==0)
        instance.setValue(i(contains_verb, buildAttributeList()), 0);
      else if(verbs.get(0).equals("vb"))
        instance.setValue(i(contains_verb, buildAttributeList()), 1);
      else if(verbs.get(0).equals("vbd"))
        instance.setValue(i(contains_verb, buildAttributeList()), 2);
      else if(verbs.get(0).equals("vbg"))
        instance.setValue(i(contains_verb, buildAttributeList()), 3);
      else if(verbs.get(0).equals("vbn"))
        instance.setValue(i(contains_verb, buildAttributeList()), 4);
      else if(verbs.get(0).equals("be"))
        instance.setValue(i(contains_verb, buildAttributeList()), 5);
    } catch (ArrayIndexOutOfBoundsException ignored) {}

    try{
      if(allLowercase)
        instance.setValue(i(term_case, buildAttributeList()), 0);
      else if(allUppercase)
        instance.setValue(i(term_case, buildAttributeList()), 3);
      else if(capitalizedWords == candidateTokens.size())
        instance.setValue(i(term_case, buildAttributeList()), 2);
      else if(capitalizedWords == 1 && Character.isUpperCase(candidateTokens.get(0).getToken().charAt(0)))
        instance.setValue(i(term_case, buildAttributeList()), 4);
      else
        instance.setValue(i(term_case, buildAttributeList()), 1);


    } catch (ArrayIndexOutOfBoundsException ignored) {}


    try{
      instance.setValue(i(candidate_size, buildAttributeList()), termSize);
    } catch (ArrayIndexOutOfBoundsException ignored) {}

    try {
      TaggedToken leftNeighbourToken = text.taggedTokenProvider().getLeftNeighbourToken(surfaceFormOccurrence);

      if(leftNeighbourToken.getPOSTag().equals("to")) {
        instance.setValue(i(pre_pos, buildAttributeList()), 0);
      }
      else if(leftNeighbourToken.getPOSTag().matches("[mnf].*")) {
        instance.setValue(i(pre_pos, buildAttributeList()), 1);
      }else if(leftNeighbourToken.getToken().matches("[aA][nN]?")) {
        instance.setValue(i(pre_pos, buildAttributeList()), 2);
      }

    } catch (ItemNotFoundException ignored) {

    } catch (ArrayIndexOutOfBoundsException ignored) {}


    try {

      if(leftContext.size() > 0) {

        if(leftContext.get(0).getPOSTag().equals("to")) {
          instance.setValue(i(pre_pos, buildAttributeList()), 0);
        }
        else if(leftContext.get(0).getPOSTag().matches("[mnf].*")) {
          instance.setValue(i(pre_pos, buildAttributeList()), 1);
        }else if(leftContext.get(0).getToken().matches("[aA][nN]?")) {
          instance.setValue(i(pre_pos, buildAttributeList()), 2);
        }
      }

    } catch (ArrayIndexOutOfBoundsException ignored) {}

    try{
      if (CandidateFeatures.quoted(surfaceFormOccurrence) == 1)
        instance.setValue(i(quoted, buildAttributeList()), 0);

    } catch (ArrayIndexOutOfBoundsException ignored) {}




    try {
      if(rightContext.size() > 0) {

        if(rightContext.get(0).getToken().equals("of")) {
          instance.setValue(i(next_pos, buildAttributeList()), 0);
        }else if(rightContext.get(0).getToken().equals("to")) {
          instance.setValue(i(next_pos, buildAttributeList()), 1);
        }else if(rightContext.get(0).getPOSTag().startsWith("be")) {
          instance.setValue(i(next_pos, buildAttributeList()), 2);
        }else if(rightContext.get(0).getPOSTag().startsWith("v")) {
          instance.setValue(i(next_pos, buildAttributeList()), 3);
        }
      }
    } catch (ArrayIndexOutOfBoundsException ignored) {}



    try {
      TaggedToken lastToken = candidateTokens.get(candidateTokens.size() - 1);


      if(lastToken.getPOSTag().equals("in")) {
        instance.setValue(i(ends_with, buildAttributeList()), 0);
      }
    } catch (ArrayIndexOutOfBoundsException ignored) {}


View Full Code Here

TOP

Related Classes of org.dbpedia.spotlight.tagging.TaggedToken

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.