section = sections.next();
tokens.clear(); //clear token for each section (STANBOL-818)
Iterator<Span> enclosed = section.getEnclosed(enclosedSpanTypes);
ChunkData activeChunk = null;
while(enclosed.hasNext()){
Span span = enclosed.next();
if(span.getStart() >= span.getEnd()){ //save guard against empty spans
log.warn("Detected Empty Span {} in section {} of Blob {}",
new Object[]{span,section, at.getBlob()});
}
if(span.getType() == SpanTypeEnum.Chunk){
ChunkData chunkData = new ChunkData((Chunk)span);
if(chunkData.isProcessable){
if(activeChunk != null){ //current Chunk not yet closed -> overlapping chunks!
if(activeChunk.getEndChar() < span.getEnd()){ //merge partly overlapping chunks
log.info(" - merge overlapping and processable Chunks {} <-> {}",
activeChunk.merged == null? activeChunk.chunk : activeChunk.merged,span);
activeChunk.merged = (Chunk)span; //set this one as last merged
} //ignore completely covered chunks
} else { // a new Chunk starts
activeChunk = chunkData;
activeChunk.startToken = tokens.size();
if(log.isDebugEnabled()){
log.debug(">> Chunk: (type:{}, startPos: {}) text: '{}'",
new Object []{
activeChunk.chunk.getType(),
activeChunk.startToken,
activeChunk.chunk.getSpan()
});
}
}
} //else ignore chunks that are not processable
} else if(span.getType() == SpanTypeEnum.Token){
TokenData tokenData = new TokenData(tokens.size(),(Token)span,activeChunk);
if(log.isDebugEnabled()){
log.debug(" > Token {}: {} (pos:{}) chunk: '{}' | morpho: {}",
new Object[]{tokenData.index,tokenData.token,
tokenData.token.getAnnotations(POS_ANNOTATION),
tokenData.inChunk != null ? tokenData.inChunk.chunk.getSpan() : "none",
tokenData.morpho != null ? tokenData.morpho : "none"});
}
tokens.add(tokenData);
if(!foundProcessable){
foundProcessable = tokenData.isProcessable;
}
if(activeChunk != null){
if(tokenData.isMatchable ){
activeChunk.matchableCount++;
}
if (span.getEnd() >= activeChunk.getEndChar()){
//this is the last token in the current chunk
activeChunk.endToken = tokens.size()-1;
log.debug(" - end Chunk@pos: {}", activeChunk.endToken);
if(tpc.isLinkMultiMatchableTokensInChunk() &&
activeChunk.matchableCount > 1 ){