case XMLStreamConstants.START_ELEMENT:
++level;
textOffset += extractor.startElement(reader.getQName());
break;
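// Text node: remember the offset at which its text starts within the
// extracted string, keyed by node id, so that token offsets reported by
// the analyzer can be mapped back to the originating node.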
case XMLStreamConstants.CHARACTERS: {
    NodeId nodeId = (NodeId) reader.getProperty(ExtendedXMLStreamReader.PROPERTY_NODE_ID);
    textOffset += extractor.beforeCharacters();
    offsets.add(textOffset, nodeId);
    textOffset += extractor.characters(reader.getXMLText());
    break;
}
}
}
} catch (IOException | XMLStreamException e) {
LOG.warn("Problem found while serializing XML: " + e.getMessage(), e);
}
// Retrieve the Analyzer that was used for indexing and querying
// this NodeProxy.
Analyzer analyzer = idxConf.getAnalyzer();
if (analyzer == null) {
// Fall back to the system default Lucene analyzer (configured in
// conf.xml) to tokenize the text and find matching query terms.
analyzer = index.getDefaultAnalyzer();
}
if (LOG.isDebugEnabled()) {
    LOG.debug("Analyzer: " + analyzer + " for path: " + path);
}
String str = extractor.getText().toString();
try {
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(str));
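// Note: reset() is called on the underlying analyzer stream, not on the
// MarkableTokenFilter wrapper below, whose own reset() has mark/rewind
// semantics (see the phrase handling further down).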
tokenStream.reset();
MarkableTokenFilter stream = new MarkableTokenFilter(tokenStream);
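// Walk the analyzed tokens and look each one up in termMap, which maps
// the text of matching terms to the query they originate from.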
while (stream.incrementToken()) {
String text = stream.getAttribute(CharTermAttribute.class).toString();
Query query = termMap.get(text);
if (query != null) {
// Phrase queries need to be handled differently to filter
// out wrong matches: only the phrase should be marked, not
// single words which may also occur elsewhere in the document
if (query instanceof PhraseQuery) {
PhraseQuery phraseQuery = (PhraseQuery) query;
Term[] terms = phraseQuery.getTerms();
if (text.equals(terms[0].text())) {
// Scan the following text and collect tokens to see
// if they are part of the phrase.
stream.mark();
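// The mark lets MarkableTokenFilter cache tokens from this position on,
// so the stream could be rewound to this point; the rewind itself is
// deliberately not used below.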
int t = 1;
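// Capture the attribute state (term text and offsets) of every token that
// matches the next expected phrase term; the states are replayed below
// once the complete phrase has been confirmed.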
List<State> stateList = new ArrayList<>(terms.length);
stateList.add(stream.captureState());
// Check t before consuming a token: for a single-term phrase no
// additional token must be pulled from the stream.
while (t < terms.length && stream.incrementToken()) {
    text = stream.getAttribute(CharTermAttribute.class).toString();
    if (text.equals(terms[t].text())) {
        stateList.add(stream.captureState());
        t++;
    } else {
        // The sequence is broken, so this is not a phrase match.
        // Do not rewind the token stream here: doing so would cause
        // matches to be missed.
        break;
    }
}
if (stateList.size() == terms.length) {
    // Every term matched in sequence: we indeed have a phrase match.
    // Replay the captured token states and record the offsets of the
    // terms, relative to the text node each of them occurs in.
    int lastIdx = -1;
    for (int i = 0; i < terms.length; i++) {
        stream.restoreState(stateList.get(i));
        OffsetAttribute offsetAttr = stream.getAttribute(OffsetAttribute.class);
        int idx = offsets.getIndex(offsetAttr.startOffset());
        NodeId nodeId = offsets.ids[idx];
        Offset offset = nodesWithMatch.get(nodeId);
        if (offset == null) {
            nodesWithMatch.put(nodeId, new Offset(offsetAttr.startOffset() - offsets.offsets[idx],
                offsetAttr.endOffset() - offsets.offsets[idx]));
        } else if (lastIdx == idx) {
            // Same text node as the previous term: extend the current match.
            offset.setEndOffset(offsetAttr.endOffset() - offsets.offsets[idx]);
        } else {
            // The phrase continues in a different text node: add a new range.
            offset.add(offsetAttr.startOffset() - offsets.offsets[idx],
                offsetAttr.endOffset() - offsets.offsets[idx]);
        }
        lastIdx = idx;
    }
}
} // End of phrase handling
} else {
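// Single term match: record its offset relative to the start of the
// text node it occurs in.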
OffsetAttribute offsetAttr = stream.getAttribute(OffsetAttribute.class);
int idx = offsets.getIndex(offsetAttr.startOffset());
NodeId nodeId = offsets.ids[idx];
Offset offset = nodesWithMatch.get(nodeId);
if (offset != null) {
    offset.add(offsetAttr.startOffset() - offsets.offsets[idx],
        offsetAttr.endOffset() - offsets.offsets[idx]);
} else {