Package org.dbpedia.spotlight.model

Examples of org.dbpedia.spotlight.model.TaggedText


    @Override
    protected Text buildText(Text text) {
      if(text instanceof TaggedText)
        return text;
      else
        return new TaggedText(text.text(), tagger);
    }
View Full Code Here


  @Override
  /** {@inheritDoc} */
  public Instance buildInstance(SurfaceFormOccurrence surfaceFormOccurrence, Instance instance) {

    TaggedText text = (TaggedText) surfaceFormOccurrence.context();
    List<TaggedToken> candidateTokens = text.taggedTokenProvider().getTaggedTokens(surfaceFormOccurrence);
    int termSize = candidateTokens.size();

    TaggedToken firstTaggedToken = candidateTokens.get(0);
    CandidateData firstTaggedTokenData = null;
    try {
      firstTaggedTokenData = dataProvider.getCandidateData(firstTaggedToken.getToken());
    } catch (ItemNotFoundException e) {
      //No information about the token!
    }


    CandidateData secondTaggedTokenData = null;
    if(candidateTokens.size() > 1){
      TaggedToken secondTaggedToken = candidateTokens.get(1);
      try {
        secondTaggedTokenData = dataProvider.getCandidateData(secondTaggedToken.getToken());
      } catch (ItemNotFoundException e) {
        //No information about the token!
      }
    }

    TaggedToken lastTaggedToken = candidateTokens.get(candidateTokens.size()-1);
    CandidateData lastTaggedTokenData = null;
    try {
      lastTaggedTokenData = dataProvider.getCandidateData(lastTaggedToken.getToken());
    } catch (ItemNotFoundException e) {
      //No information about the token!
    }


    CandidateData lastBut1TaggedTokenData = null;

    if(candidateTokens.size() > 1) {
      TaggedToken lastBut1TaggedToken = candidateTokens.get(candidateTokens.size()-2);
      try {
        lastBut1TaggedTokenData = dataProvider.getCandidateData(lastBut1TaggedToken.getToken());
      } catch (ItemNotFoundException e) {
        //No information about the token!
      }
    }


    /**
     * Left context
     */

    List<TaggedToken> leftContext = null;
    try {
      leftContext = text.taggedTokenProvider().getLeftContext(surfaceFormOccurrence, 2);
    } catch (ItemNotFoundException ignored) {}

    CandidateData left1 = null;
    if(leftContext != null && leftContext.size() > 0) {
      try {
        String token;
        if(leftContext.size() == 1) {
          /**
           * There are no more tokens to the left, the token is sentence initial.
           */
          token = leftContext.get(0).getToken().toLowerCase();

        }else{
          token = leftContext.get(0).getToken();

        }
        left1 = dataProvider.getCandidateData(token);
      } catch (ItemNotFoundException e) {
        //No information about the token
      }
    }


    /**
     * Right context
     */

    List<TaggedToken> rightContext = null;
    try {
      rightContext = text.taggedTokenProvider().getRightContext(surfaceFormOccurrence, 2);
    } catch (ItemNotFoundException ignored) {}

    CandidateData right1 = null;
    if(rightContext != null && rightContext.size() > 0) {
      try {
        right1 = dataProvider.getCandidateData(rightContext.get(0).getToken());
      } catch (ItemNotFoundException e) {
        //No information about the token
      }
    }



    /**
     * Features:
     */

    if(termSize == 2) {

      try {
        if(firstTaggedTokenData != null && secondTaggedTokenData != null) {
          CoOccurrenceData bigramData = dataProvider.getBigramData(firstTaggedTokenData, secondTaggedTokenData);

          //if (bigramData.getUnitCountWeb() > bigramLeftWebMin)
            instance.setValue(i(count_web, buildAttributeList()), bigramData.getUnitCountWeb());
        }
      } catch (ItemNotFoundException ignored) {}
      catch (ArrayIndexOutOfBoundsException ignored) {}



    }



    List<String> verbs = new LinkedList<String>();

    boolean allLowercase = surfaceFormOccurrence.surfaceForm().name().toLowerCase().equals(surfaceFormOccurrence.surfaceForm().name());
    boolean allUppercase = surfaceFormOccurrence.surfaceForm().name().toUpperCase().equals(surfaceFormOccurrence.surfaceForm().name());

    int capitalizedWords = 0;

    for(TaggedToken candidateToken : candidateTokens) {
      if(candidateToken.getPOSTag().startsWith("v") || candidateToken.getPOSTag().equals("be")) {
        verbs.add(candidateToken.getPOSTag());
      }

      if(Character.isUpperCase(candidateToken.getToken().charAt(0)))
        capitalizedWords++;
    }

    try{
      if(verbs.size() > 1)
        instance.setValue(i(contains_verb, buildAttributeList()), 5);
      else if(verbs.size()==0)
        instance.setValue(i(contains_verb, buildAttributeList()), 0);
      else if(verbs.get(0).equals("vb"))
        instance.setValue(i(contains_verb, buildAttributeList()), 1);
      else if(verbs.get(0).equals("vbd"))
        instance.setValue(i(contains_verb, buildAttributeList()), 2);
      else if(verbs.get(0).equals("vbg"))
        instance.setValue(i(contains_verb, buildAttributeList()), 3);
      else if(verbs.get(0).equals("vbn"))
        instance.setValue(i(contains_verb, buildAttributeList()), 4);
      else if(verbs.get(0).equals("be"))
        instance.setValue(i(contains_verb, buildAttributeList()), 5);
    } catch (ArrayIndexOutOfBoundsException ignored) {}

    try{
      if(allLowercase)
        instance.setValue(i(term_case, buildAttributeList()), 0);
      else if(allUppercase)
        instance.setValue(i(term_case, buildAttributeList()), 3);
      else if(capitalizedWords == candidateTokens.size())
        instance.setValue(i(term_case, buildAttributeList()), 2);
      else if(capitalizedWords == 1 && Character.isUpperCase(candidateTokens.get(0).getToken().charAt(0)))
        instance.setValue(i(term_case, buildAttributeList()), 4);
      else
        instance.setValue(i(term_case, buildAttributeList()), 1);


    } catch (ArrayIndexOutOfBoundsException ignored) {}


    try{
      instance.setValue(i(candidate_size, buildAttributeList()), termSize);
    } catch (ArrayIndexOutOfBoundsException ignored) {}

    try {
      TaggedToken leftNeighbourToken = text.taggedTokenProvider().getLeftNeighbourToken(surfaceFormOccurrence);

      if(leftNeighbourToken.getPOSTag().equals("to")) {
        instance.setValue(i(pre_pos, buildAttributeList()), 0);
      }
      else if(leftNeighbourToken.getPOSTag().matches("[mnf].*")) {
View Full Code Here

TOP

Related Classes of org.dbpedia.spotlight.model.TaggedText

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.