Package opennlp.tools.util

Examples of opennlp.tools.util.Span


        }

        List<LocationOccurrence> nerResults = new ArrayList<LocationOccurrence>();

        // The values used in these Spans are string character offsets
        Span sentenceSpans[] = sentenceDetector.sentPosDetect(plainText);

        // Each sentence gets processed on its own
        for (Span sentenceSpan : sentenceSpans) {

            // find the start and end position of this sentence in the document
            String sentence = plainText.substring(sentenceSpan.getStart(), sentenceSpan.getEnd());

            // tokenize the text into the required OpenNLP format
            String[] tokens = tokenizer.tokenize(sentence);

            //the values used in these Spans are string character offsets of each token from the sentence beginning
            Span[] tokenPositionsWithinSentence = tokenizer.tokenizePos(sentence);

            // find the location names in the tokenized text
            // the values used in these Spans are NOT string character offsets, they are indices into the 'tokens' array
            Span names[] = nameFinder.find(tokens);


            //for each name that got found, create our corresponding occurrence
            for (Span name : names) {
View Full Code Here


   
    if (tokenOffset >= sentences.length) {
      return false;
    }
   
    Span sentenceSpan = sentences[tokenOffset];
    clearAttributes();
    int start = sentenceSpan.getStart();
    int end   = sentenceSpan.getEnd();
    termAtt.copyBuffer(inputSentence, start, end-start);
    posIncrAtt.setPositionIncrement(1);
    offsetAtt.setOffset(start, end);
    tokenOffset++;
   
View Full Code Here

  */
  //<end id="ne-remove-conflicts"/>
  @Test
  public void testRemoveConflicts() {
    List<Annotation> annotations = new ArrayList<Annotation>();
    annotations.add(new Annotation("person", new Span(1, 5), 0.75));
    annotations.add(new Annotation("person", new Span(7, 10), 0.95));
    annotations.add(new Annotation("location", new Span(11, 15), 0.85));
    removeConflicts(annotations);
    assertTrue(annotations.size() == 3);
    annotations.add(new Annotation("location", new Span(2, 7), 0.85));
    removeConflicts(annotations);
    assertTrue(annotations.size() == 3);
    assertTrue(((Annotation) annotations.get(0)).getSpan().getStart() == 2);
    annotations.clear();
    annotations.add(new Annotation("person", new Span(1, 5), 0.75));
    annotations.add(new Annotation("person", new Span(7, 10), 0.95));
    annotations.add(new Annotation("location", new Span(11, 15), 0.85));
    annotations.add(new Annotation("person", new Span(3, 8), 0.85));
    removeConflicts(annotations);
    assertTrue(annotations.size() == 2);
    assertTrue(((Annotation) annotations.get(0)).getSpan().getStart() == 7);
  }
View Full Code Here

      Span[] tokenSpans = tokenizer.tokenizePos(sentences[si]); //<co id="co.opennlp.name.tokenizepos"/>
      String[] tokens = Span.spansToStrings(tokenSpans, sentences[si]); //<co id="co.opennlp.name.convert2strings"/>
      Span[] names = finder.find(tokens); //<co id="co.opennlp.name.findnames4"/>

      for (int ni = 0; ni < names.length; ni++) {
        Span startSpan = tokenSpans[names[ni].getStart()]; //<co id="co.opennlp.name.computestart"/>
        int nameStart  = startSpan.getStart();
       
        Span endSpan   = tokenSpans[names[ni].getEnd() - 1]; //<co id="co.opennlp.name.computeend"/>
        int nameEnd    = endSpan.getEnd();
       
        String name = sentences[si].substring(nameStart, nameEnd); //<co id="co.opennlp.name.namestring"/>
        System.out.println(name);
      }
    }
View Full Code Here

    int chunkStart = -1;
    String chunkType = null;
    double logProb=0;
    for (int ci=0,cn=chunks.length;ci<cn;ci++) {
      if (ci > 0 && !chunks[ci].startsWith("I-") && !chunks[ci-1].equals("O")) {
        Span span = new Span(children[chunkStart].getSpan().getStart(),children[ci-1].getSpan().getEnd());
        tokens.insert(new Parse(tokens.getText(), span, chunkType, logProb,children[ci-1]));
        logProb=0;
      }           
      if (chunks[ci].startsWith("B-")) {
        chunkStart = ci;
        chunkType = chunks[ci].substring(2);
      }
      logProb+=Math.log(probs[ci]);
    }
    if (!chunks[chunks.length-1].equals("O")) {
      int ci = chunks.length;
      Span span = new Span(children[chunkStart].getSpan().getStart(),children[ci-1].getSpan().getEnd());
      tokens.insert(new Parse(tokens.getText(), span, chunkType, logProb,children[ci-1]));
    }
    return tokens;
  }
View Full Code Here

  @Override
  protected Iterator<Event> createEvents(TokenSample tokenSample) {

    List<Event> events = new ArrayList<Event>(50);

    Span tokens[] = tokenSample.getTokenSpans();
    String text = tokenSample.getText();

    if (tokens.length > 0) {

      int start = tokens[0].getStart();
      int end = tokens[tokens.length - 1].getEnd();

      String sent = text.substring(start, end);

      Span[] candTokens = WhitespaceTokenizer.INSTANCE.tokenizePos(sent);

      int firstTrainingToken = -1;
      int lastTrainingToken = -1;
      for (Span candToken : candTokens) {
        Span cSpan = candToken;
        String ctok = sent.substring(cSpan.getStart(), cSpan.getEnd());
        //adjust cSpan to text offsets
        cSpan = new Span(cSpan.getStart() + start, cSpan.getEnd() + start);
        //should we skip this token
        if (ctok.length() > 1
          && (!skipAlphaNumerics || !alphaNumeric.matcher(ctok).matches())) {

          //find offsets of annotated tokens inside of candidate tokens
          boolean foundTrainingTokens = false;
          for (int ti = lastTrainingToken + 1; ti < tokens.length; ti++) {
            if (cSpan.contains(tokens[ti])) {
              if (!foundTrainingTokens) {
                firstTrainingToken = ti;
                foundTrainingTokens = true;
              }
              lastTrainingToken = ti;
            }
            else if (cSpan.getEnd() < tokens[ti].getEnd()) {
              break;
            }
            else if (tokens[ti].getEnd() < cSpan.getStart()) {
              //keep looking
            }
            else {
              if (logger.isLoggable(Level.WARNING)) {
                logger.warning("Bad training token: " + tokens[ti] + " cand: " + cSpan +
                    " token="+text.substring(tokens[ti].getStart(), tokens[ti].getEnd()));
              }
            }
          }

          // create training data
          if (foundTrainingTokens) {

            for (int ti = firstTrainingToken; ti <= lastTrainingToken; ti++) {
              Span tSpan = tokens[ti];
              int cStart = cSpan.getStart();
              for (int i = tSpan.getStart() + 1; i < tSpan.getEnd(); i++) {
                String[] context = cg.getContext(ctok, i - cStart);
                events.add(new Event(TokenizerME.NO_SPLIT, context));
              }

              if (tSpan.getEnd() != cSpan.getEnd()) {
                String[] context = cg.getContext(ctok, tSpan.getEnd() - cStart);
                events.add(new Event(TokenizerME.SPLIT, context));
              }
            }
          }
        }
View Full Code Here

      else {
        id = -1;
        // throw invalid format exception ...
      }
       
      mentionStack.push(new CorefMention(new Span(beginOffset, beginOffset), id, attributes.get("MIN")));
    }
  }
View Full Code Here

  @Override
  public void endElement(String name) {
   
    if (COREF_ELEMENT.equals(name)) {
      CorefMention mention = mentionStack.pop();
      mention.span = new Span(mention.span.getStart(), text.size());
      mentions.add(mention);
    }
   
    if (MucElementNames.CONTENT_ELEMENTS.contains(name)) {
     
View Full Code Here

  }
 
  static Parse createIncompleteParse(String tokens[]) {
   
    // produce text
    Span tokenSpans[] = new Span[tokens.length];
    StringBuilder textBuilder = new StringBuilder();
   
    for (int i = 0; i < tokens.length; i++) {
     
      if (textBuilder.length() > 0) {
        textBuilder.append(' ');
      }
     
      int startOffset = textBuilder.length();
      textBuilder.append(tokens[i]);
      tokenSpans[i] = new Span(startOffset, textBuilder.length());
    }
   
    String text = textBuilder.toString();
   
    Parse p = new Parse(text, new Span(0, text.length()), AbstractBottomUpParser.INC_NODE, 0, 0);
   
    for (int i = 0; i < tokenSpans.length; i++) {
      Span tokenSpan = tokenSpans[i];
      p.insert(new Parse(text, new Span(tokenSpan.getStart(), tokenSpan.getEnd()), AbstractBottomUpParser.TOK_NODE, 0, i));
    }
   
    return p;
  }
View Full Code Here

    List<Span> openNLPSpans = new LinkedList<Span>();

    while (containingTokens.hasNext()) {
      AnnotationFS tokenAnnotation = containingTokens.next();

      openNLPSpans.add(new Span(tokenAnnotation.getBegin()
          - sentence.getBegin(), tokenAnnotation.getEnd()
          - sentence.getBegin()));
    }

    Span[] spans = openNLPSpans.toArray(new Span[openNLPSpans.size()]);
View Full Code Here

TOP

Related Classes of opennlp.tools.util.Span

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.