// Source code of com.jpetrak.gate.stringannotation.extendedgazetteer2.Chunk

package com.jpetrak.gate.stringannotation.extendedgazetteer2;


import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Utils;
import gate.util.GateRuntimeException;


import java.util.Arrays;
import java.util.List;


/** 
 * Represents a chunk of text to generate gazetteer matches for.
 * The text is either a part of the original document text or text generated
 * from the content of a feature of the word annotations.
 * The text is made up of the parts that come from each wholly contained
 * word annotation separated by a space for each wholly contained
 * space annotation.
 * 
 * @author Johann Petrak
 *
 */
public class Chunk {
  private Chunk() { }
  
  private static final boolean debug = false;
  
  private char[] text;
  // an array of offsets that indicate for each character in our text, at which 
  // offset an annotation would have to start, if this character were at the beginning
  // of the match. If the text is created from the feature value of the word annotations,
  // then all offsets for all of the characters of the text from one annotation will
  // refer to the start offset of that annotation.
  // NOTE: this int ALSO stores information about whether that location in the text
  // indicates a word begginning. We use the sign bit for this and store the offset
  // in the following way: 
  // No word beginning: the offset, unchanged (which is always >= 0)
  // Word beginning: -(offset+1) (offset 0->-1, offset 1->-2 etc.)
  private int[] startOffsets;
  // an array of offsets that indicate for each character in our text, at which 
  // offset an annotation would have to end, if this character were at the end
  // of the match. If the text is created from the feature value of the word annotations,
  // then all offsets for all of the characters of the text from one annotation will
  // refer to the end offset of that annotation.
  // NOTE: this int ALSO stores information about whether that location in the text
  // indicates a word ending. We use the sign bit for this and store the offset
  // in the following way: 
  // No word ending: the offset, unchanged (which is always >= 0)
  // Word ending: -(offset+1) (offset 0->-1, offset 1->-2 etc.)
  private int[] endOffsets;
  
  private int length;
  private int initialLength;
  private int from;
  private int to;
  
  /**
   * Get the original document start offset for this offset in the text
   * 
   * @param off
   * @return
   */
  public int getStartOffset(int off) {
    int tmp = startOffsets[off];
    if(tmp < 0) {
      return (-tmp)-1;
    } else { 
      return tmp;
    }
  }
  /**
   * Get the original document end offset for this offset in the text
   * 
   * @param off
   * @return
   */
  public int getEndOffset(int off) {
    int tmp = endOffsets[off];
    if(tmp < 0) {
      return (-tmp)-1;
    } else { 
      return tmp;
    }
  }
  
  // store the original start offset for the text offset
  private void putStartOffset(int textoff, int origoff) {
    guardOffsetP(textoff);
    startOffsets[textoff] = origoff;
  }
  // store the original end offset for the text offset
  private void putEndOffset(int textoff, int origoff) {
    guardOffsetP(textoff);
    endOffsets[textoff] = origoff;
  }
  
  private void putTextChar(int textoff, char ch) {
    guardOffsetP(textoff);
    text[textoff] = ch;
  }
  
  
  // set this text offset as a word start
  private void setIsValidMatchStart(int textoff) {
    guardOffsetP(textoff);
    int tmp = startOffsets[textoff];
    if(tmp >= 0) {
      startOffsets[textoff] = -(tmp+1);
    }
  }
  // set this text offset as a word end
  private void setIsValidMatchEnd(int textoff) {
    guardOffsetP(textoff);
    int tmp = endOffsets[textoff];
    if(tmp >= 0) {
      endOffsets[textoff] = -(tmp+1);
    }
  }
  
  public boolean isValidMatchStart(int offset) {
    guardOffset(offset);
    if(startOffsets[offset] < 0) {
      return true;
    } else {
      return false;
    }
  }


  public boolean isValidMatchEnd(int offset) {
    guardOffset(offset);
    if(endOffsets[offset] < 0) {
      return true;
    } else {
      return false;
    }
  }
  
  
  public char getCharAt(int textoff) {
    guardOffset(textoff);
    return text[textoff];
  }
  
  private void guardOffset(int off) {
    if(!(off < length) || off<0) {
      throw new GateRuntimeException("Attempt to use invalid chunk text offset: off="+off+" length="+length);
    }    
  }
  // if necessary, enlarge our arrays to hold more data!
  private void guardOffsetP(int off) {
    if(!(off < text.length)) {
      if(debug) {
        System.out.println("enlarging arrays!");
      }
      int newLength = text.length+initialLength;
      char[] newtext = new char[newLength];
      System.arraycopy(text, 0, newtext, 0, text.length);
      text = newtext;
      int[] newStarts = new int[newLength];
      System.arraycopy(startOffsets, 0, newStarts, 0, startOffsets.length);
      startOffsets = newStarts;
      int[] newEnds   = new int[newLength];
      System.arraycopy(endOffsets, 0, newEnds, 0, endOffsets.length);
      endOffsets = newEnds;
    }    
  }
  
  public static Chunk makeChunk(
      Document document, long fromOffset, long toOffset,
      boolean caseNormalize,
      AnnotationSet processAnns, String wordAnnotationType, String wordAnnotationFeature, String spaceAnnotationType,
      boolean startWithWordStart, boolean endWithWordEnd, 
      String matchStartFeature, String matchEndFeature, String matchTypeFeature) {
    
    // TODO: at the moment we do not do language specific case normalization here because this would
    // make it more complex to keep track of the indices -- String.toUpper(locale) can change the size of the string!
    // Instead we use char.toUpper which is not perfect but makes things easier!
    Chunk chunk = new Chunk();
    boolean haveFeature = !(wordAnnotationFeature == null || wordAnnotationFeature.equals(""));
    String doctext = document.getContent().toString();
    chunk.initialLength = Math.max((int)(toOffset-fromOffset),500);
    chunk.from = (int)fromOffset;
    chunk.to = (int)toOffset;
    chunk.text = new char[chunk.initialLength];
    chunk.endOffsets = new int[chunk.initialLength];
    chunk.startOffsets = new int[chunk.initialLength];
    AnnotationSet actualAnns = processAnns.get(fromOffset,toOffset);
    AnnotationSet wordAnns = actualAnns.get(wordAnnotationType);
    if(wordAnns.isEmpty()) {
      chunk.length = 0;
      return chunk;
    }
    List<Annotation> actualAnnsList = Utils.inDocumentOrder(actualAnns);
    int i = 0; // index into the text, startOffsets and endOffsets arrays
    boolean firstSpace = true;
    if(debug) {
      System.out.println("Number of total processAnns="+processAnns.size());
      System.out.println("Number of anns in range="+actualAnns.size());
    }
    
    for(Annotation actualAnn : actualAnnsList) {
      int curStart = actualAnn.getStartNode().getOffset().intValue();
      int curEnd   = actualAnn.getEndNode().getOffset().intValue();
      String curType = actualAnn.getType();
      if(curType.equals(spaceAnnotationType)) {
        if(firstSpace) {
          // add a space 
          chunk.putTextChar(i,' ');
          // was: chunk.text[i] = ' ';
          i++;
          firstSpace = false;
        }        
      } else {
        // we have a word
        if(debug) {
          System.out.println("Add word annotation: "+actualAnn);
          System.out.println("i="+i);
        }
        
        String wordText = null;
        
        // if a matchTypeFeature is specified:
        if(matchTypeFeature != null && !matchTypeFeature.isEmpty()) {
          Object val = actualAnn.getFeatures().get(matchTypeFeature);
          if(val != null) {
            String valstr = val.toString();
            if(valstr.equals("n") || valstr.isEmpty()) {
              // this means just normal processing, do nothing special
            } else if(valstr.equals("i")) {
              // ignore the word as if it was not here at all
              continue;
            } else if(valstr.equals("f")) {
              // fail: never match this word!
              // we achieve this by a little trick: we just set the first character
              // of the word to char=0 -- since 0 can never occur in a gazetteer entry,
              // all matches will fail at the first character of the word
              wordText = new String(new char[] { 0 });
            }
          }
        }


        // unless we havent set the text already to 0 earlier
        if (wordText == null) {
          if (haveFeature) {
            wordText = (String) actualAnn.getFeatures().get(
                wordAnnotationFeature);
            if (debug) {
              System.out.println("Got feature value=" + wordText);
            }
          } else {
            wordText = gate.Utils.stringFor(document, actualAnn);
            if (debug) {
              System.out.println("Got document text=" + wordText);
            }
          }
        }
        if(wordText == null || wordText.isEmpty()) {
          if(debug) {
            System.out.println("Skipping: no text!");
          }
          continue;
        }


        // TODO: if the wordStartFeature is given, get its value and set the default
        // for validMatchStart accordingly, otherwise set it to true.
        // Same for wordEndFeature
        boolean validMatchEndDefault = true;
        boolean validMatchStartDefault = true;
        
        if(matchEndFeature != null && !matchEndFeature.isEmpty()) {
          Object val = actualAnn.getFeatures().get(matchEndFeature);
          if(val != null) {
            String valstr = val.toString();
            if(valstr.isEmpty() || valstr.equals("false") || valstr.equals("0")) {
              validMatchEndDefault = false;
            }
          } else {
            validMatchEndDefault = false;
          }          
        }
        if(matchStartFeature != null && !matchStartFeature.isEmpty()) {
          Object val = actualAnn.getFeatures().get(matchStartFeature);
          if(val != null) {
            String valstr = val.toString();
            if(valstr.isEmpty() || valstr.equals("false") || valstr.equals("0")) {
              validMatchStartDefault = false;
            }
          } else {
            validMatchStartDefault = false;
          }
        }
        
        firstSpace = true; 
        // add the wordText
        char[] wordTextChars = wordText.toCharArray();
        for(int j=0;j<wordTextChars.length;j++) {
          char c = wordTextChars[j];
          if(caseNormalize) {
            c = Character.toUpperCase(c);
          }
          chunk.putTextChar(i, wordTextChars[j]);
          // was: chunk.text[i] = wordTextChars[j];
          if(haveFeature) {
            chunk.putStartOffset(i, curStart);
            // was: chunk.startOffsets[i] = curStart; 
            chunk.putEndOffset(i, curEnd-1);
            // was: chunk.endOffsets[i] = curEnd-1;
          } else {
            chunk.putStartOffset(i, curStart+j);
            // was: chunk.startOffsets[i] = curStart+j;
            chunk.putEndOffset(i, curStart+j);
            // chunk.endOffsets[i] = curStart+j;
          }
          


          // If the word specifically set matchStartFeature to true,
          // then set the default for this being a valid match to true,
          // otherwise set it to false.
          // Same for matchEndFeauture.
          boolean validMatchStart = validMatchStartDefault;
          boolean validMatchEnd = validMatchEndDefault;
          
          // if startWithWordStart is set, then make sure we do not have 
          // a valid match start unless we are at the first character of the word
          if(startWithWordStart && j!=0) {
            validMatchStart = false;
          }
          // if startWithWordEnd is set, then make sure we do not have 
          // a valid match end unless we are at the last character of the word
          if(endWithWordEnd && j!=(wordTextChars.length-1)) {
            validMatchEnd = false;
          }
          if(validMatchStart) {
            chunk.setIsValidMatchStart(i);
          }
          if(validMatchEnd) {
            chunk.setIsValidMatchEnd(i);
          }
          i++;
        }
      }
    } // for actualAnn
    if(debug) {
      System.out.println("Created chunk: "+chunk);
    }
    chunk.length = i;
    return chunk;
  }
  public static Chunk makeChunk(Document document, Annotation ann, boolean caseNormalise,
      AnnotationSet processAnns, String wordAnnotationType, String wordAnnotationFeature, String spaceAnnotationType,
      boolean startWithWordStart, boolean endWithWordEnd, String wordStartFeature, String wordEndFeature,
      String wordFeature) {
    return makeChunk(document, ann.getStartNode().getOffset(),ann.getEndNode().getOffset(),caseNormalise, processAnns, 
        wordAnnotationType,wordAnnotationFeature,spaceAnnotationType,
        startWithWordStart, endWithWordEnd, wordStartFeature, wordEndFeature, wordFeature);
  }
 
  public int getLength() {
    return length;
  }
  
  public String getTextString(int from, int to) {
    guardOffset(from);
    guardOffset(to);
    return new String(Arrays.copyOfRange(text, from, to+1));
  }
  
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("\nChunk: ");
    sb.append("from=").append(from).append(" to=").append(to).append(" length="+length);
    sb.append("\ntext=");
    sb.append(new String(text));
    sb.append("\nstartOffsets: ");
    for(int i=0; i<length; i++ ) {
      sb.append(getStartOffset(i)).append(",");
    }
    sb.append("\nstartWord: ");
    for(int i=0; i<length; i++ ) {
      sb.append(isValidMatchStart(i)).append(",");
    }
    sb.append("\nendOffsets: ");
    for(int i=0; i<length; i++ ) {
      sb.append(getEndOffset(i)).append(",");
    }
    sb.append("\nendWord: ");
    for(int i=0; i<length; i++ ) {
      sb.append(isValidMatchEnd(i)).append(",");
    }
    return sb.toString();
  }
  
  public boolean isEmpty() {
    return (length == 0);
  }
  
}
// Source code of com.jpetrak.gate.stringannotation.extendedgazetteer2.Chunk

// Related classes of com.jpetrak.gate.stringannotation.extendedgazetteer2.Chunk