boolean foundFirst = false;
boolean useRegEx = false;
//Search result and teaser holders
Vector_Float resultCoords = new Vector_Float(0);
Vector_String resultTeasers = new Vector_String(0);
//Extract the text data into local arrays for searching
copyToArrays();
//Remove any hidden text on page as should not be found
cleanupShadowsAndDrownedObjects(false);
//Get unused text objects and sort them for correct searching
int[] items = getsortedUnusedFragments(true, false);
/**
* Count the fragments in each writing mode and rank the modes from most
* to least common. Writing modes with no fragments are skipped below.
*/
int l2r = 0;
int r2l = 0;
int t2b = 0;
int b2t = 0;
for(int i=0; i!=items.length; i++){
switch(writingMode[items[i]]){
case 0 :l2r++; break;
case 1 :r2l++; break;
case 2 :t2b++; break;
case 3 :b2t++; break;
}
}
int[] unsorted = new int[]{l2r, r2l, t2b, b2t};
int[] sorted = new int[]{l2r, r2l, t2b, b2t};
//Set all to -1 so we can tell if it's been set yet
int[] writingModes = new int[]{-1,-1,-1,-1};
Arrays.sort(sorted);
for(int i=0; i!= unsorted.length; i++){
for(int j=0; j < sorted.length; j++){
if(unsorted[i]==sorted[j]){
int pos = j - 3;
if(pos<0)
pos=-pos;
if(writingModes[pos]==-1){
writingModes[pos] = i;
j=sorted.length;
}
}
}
}
for(int u=0; u!=writingModes.length; u++){
int writingMode = writingModes[u];
//If there are no fragments in this writing mode, skip it
if(unsorted[writingMode]!=0){
//Merge text fragments into lines as displayed on page
createLines(items.length, items, writingMode, true, false, true);
//Bitwise flags for regular expressions engine, options always required
int options = 0;
//Match case insensitively unless a case sensitive search was requested
if((searchType & SearchType.CASE_SENSITIVE) != SearchType.CASE_SENSITIVE){
options =(options | Pattern.CASE_INSENSITIVE);
}
//Only find first occurrence of each search term
if((searchType & SearchType.FIND_FIRST_OCCURANCE_ONLY) == SearchType.FIND_FIRST_OCCURANCE_ONLY){
firstOccuranceOnly = true;
}
//Only find whole words, not partial words
if((searchType & SearchType.WHOLE_WORDS_ONLY) == SearchType.WHOLE_WORDS_ONLY){
wholeWordsOnly = true;
}
//Allow search to find split line results
if((searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS){
options =(options | Pattern.MULTILINE | Pattern.DOTALL);
}
//Allow the use of regular expressions symbols
if((searchType & SearchType.USE_REGULAR_EXPRESSIONS) == SearchType.USE_REGULAR_EXPRESSIONS){
useRegEx = true;
}
/**
* create local copies of arrays
*/
float[] f_y1 = this.f_y1, f_y2 = this.f_y2;
/**
* swap around x and y so the routine works in all cases
*/
boolean valuesSwapped = false;
if (writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
f_y1 = this.f_y1;
f_y2 = this.f_y2;
} else if (writingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
f_y1 = this.f_y1;
f_y2 = this.f_y2;
} else if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
f_y1 = this.f_x2;
f_y2 = this.f_x1;
valuesSwapped = true;
} else if (writingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
f_y2 = this.f_x1;
f_y1 = this.f_x2;
valuesSwapped = true;
}
//Portions of text to perform the search on and find teasers
String[] searchText;
String[] coordsText;
//Merge all text into one with \n line separators
//This will allow checking for multi line split results
String plain = "";
String raw = "";
for(int i=0; i!=content.length; i++){
if(content[i]!=null && writingMode == this.writingMode[i]){
raw += content[i] +"\n";
plain += content[i] +"\n";
}
}
//Remove double spaces, replacing them with single spaces
raw = removeDuplicateSpaces(raw);
plain = removeDuplicateSpaces(plain);
//Strip xml from content and keep coords and text data
raw = Strip.stripXML(raw,isXMLExtraction).toString();
//Strip xml and coords data from content and keep text data
plain = removeHiddenMarkers(plain);
plain = Strip.stripXML(plain,isXMLExtraction).toString();
//Store text in the search and teaser arrays
searchText = new String[]{plain};
coordsText = new String[]{raw};
//Hold starting point data at page rotation
Point resultStart;
//Work through the search terms one at a time
for(int j=0; j!=terms.length; j++){
String searchValue = terms[j];
//Set the default separator between words in a search term
String sep = " ";
//Multiline needs space or newline to be recognised as word separators
if((searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS){
sep = "[ \\\\n]";
}
//if not using reg ex add reg ex literal flags around the text and word separators
if(!useRegEx){
searchValue = "\\Q"+searchValue+"\\E";
sep = "\\\\E"+sep+"\\\\Q";
}
//If word separator has changed, replace all spaces with modified separator
if(!sep.equals(" ")){
searchValue = searchValue.replaceAll(" ", sep);
}
//Surround search term with word boundary tags to match whole words
if(wholeWordsOnly)
searchValue = "\\b"+searchValue+"\\b";
//Create pattern to match search term
Pattern searchTerm = Pattern.compile(searchValue, options);
//Create pattern to match search term with two words before and after
Pattern teaserTerm = Pattern.compile("(?:\\S+\\s)?\\S*(?:\\S+\\s)?\\S*"+searchValue+"\\S*(?:\\s\\S+)?\\S*(?:\\s\\S+)?", options);
//Loop through all search text
for(int i=0; i!=searchText.length; i++){
//Get text data and text+coord data
String plainText = searchText[i];
String coordText = coordsText[i];
//So long as text data is not null
if(plainText!=null){
//Create two matchers for finding search term and teaser
Matcher termFinder = searchTerm.matcher(plainText);
Matcher teaserFinder = teaserTerm.matcher(plainText);
boolean needToFindTeaser = true;
//Keep looping till no result is returned
while(termFinder.find()){
resultStart = null;
//Make note of the text found and index in the text
String foundTerm = termFinder.group();
int termStarts = termFinder.start();
int termEnds = termFinder.end()-1;
//If storing teasers
if(includeTease){
//Store the term found as a default value
String teaser = foundTerm;
if(includeHTMLtags)
teaser = "<b>"+teaser+"</b>";
boolean itemFound = false;
if(needToFindTeaser){
itemFound = teaserFinder.find();
}
if(itemFound){
//Get a teaser if found and set the search term to bold if allowed
if(teaserFinder.start()<termStarts && teaserFinder.end()>termEnds){
//replace default with found teaser
teaser = teaserFinder.group();
if(includeHTMLtags){
//Calculate points to add bold tags
int teaseStarts = termStarts-teaserFinder.start();
int teaseEnds = (termEnds-teaserFinder.start())+1;
//Add bold tags
teaser = teaser.substring(0, teaseStarts) + "<b>" +
teaser.substring(teaseStarts, teaseEnds) + "</b>" +
teaser.substring(teaseEnds, teaser.length());
}
needToFindTeaser = true;
}else{
needToFindTeaser = false;
}
}
//Store teaser
resultTeasers.addElement(teaser);
}
//Get coords of found text for highlights
float currentX = 0;
float width = 0;
//Track point in text data line (without coord data)
int pointInLine = -1;
//Track line on page
int lineCounter = 0;
//Skip null values and values not in the correct writing mode to ensure correct result coords
while(content[lineCounter]==null || writingMode!=this.writingMode[lineCounter])
lineCounter++;
//Flags used to catch if result is split across lines
boolean startFound = false;
boolean endFound = false;
//Cycle through coord text looking for coords of this result
//Ignore first value as it is known to be the first marker
for(int pointer=1; pointer<coordText.length(); pointer++){
// find second marker and get x coord
int startPointer = pointer;
while (pointer < coordText.length()) {
if (coordText.charAt(pointer) == MARKER2)
break;
pointer++;
}
//Convert text to float value for x coord
currentX = Float.parseFloat(coordText.substring(startPointer, pointer));
pointer++;
// find third marker and get width
startPointer = pointer;
while (pointer < coordText.length()) {
if (coordText.charAt(pointer) == MARKER2)
break;
pointer++;
}
//Convert text to float value for character width
width = Float.parseFloat(coordText.substring(startPointer, pointer));
pointer++;
// find fourth marker and get text (character)
startPointer = pointer;
while (pointer < coordText.length()) {
if (coordText.charAt(pointer) == MARKER2)
break;
pointer++;
}
//Store text to check for newline character later
String text = coordText.substring(startPointer, pointer);
pointInLine+=text.length();
//Start of term not found yet.
//Point in line is equal to or greater than start of the term.
//Store coords and mark start as found.
if(!startFound && pointInLine>=termStarts){
resultStart = new Point((int)currentX, (int)f_y1[lineCounter]);
startFound = true;
}
//End of term not found yet.
//Point in line is equal to or greater than end of the term.
//Store coords and mark end as found.
if(!endFound && pointInLine>=termEnds){
if (valuesSwapped){
if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
resultCoords.addElement((int) f_y2[lineCounter]);
resultCoords.addElement((int) currentX+width);
resultCoords.addElement(resultStart.y);
resultCoords.addElement(resultStart.x);
resultCoords.addElement(0.0f);
} else {
resultCoords.addElement((int) f_y2[lineCounter]);
resultCoords.addElement(resultStart.x);
resultCoords.addElement(resultStart.y);
resultCoords.addElement((int) currentX+width);
resultCoords.addElement(0.0f);
}
}else{
resultCoords.addElement(resultStart.x);
resultCoords.addElement(resultStart.y);
resultCoords.addElement(currentX + width);
resultCoords.addElement(f_y2[lineCounter]);
resultCoords.addElement(0.0f);
}
endFound = true;
}
//Using multi line option.
//Start of term found.
//End of term not found.
//New line character found.
//Set up multi line result.
if(startFound && !endFound && text.contains("\n")){
//Set ends coords
if (valuesSwapped){
if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
resultCoords.addElement((int) f_y2[lineCounter]);
resultCoords.addElement((int) currentX+width);
resultCoords.addElement(resultStart.y);
resultCoords.addElement(resultStart.x);
resultCoords.addElement(linkedSearchAreas); //Mark next result as linked
} else {
resultCoords.addElement((int) f_y2[lineCounter]);
resultCoords.addElement(resultStart.x);
resultCoords.addElement(resultStart.y);
resultCoords.addElement((int) currentX+width);
resultCoords.addElement(linkedSearchAreas); //Mark next result as linked
}
}else{
resultCoords.addElement(resultStart.x);
resultCoords.addElement(resultStart.y);
resultCoords.addElement(currentX + width);
resultCoords.addElement(f_y2[lineCounter]);
resultCoords.addElement(linkedSearchAreas); //Mark next result as linked
}
//Set start of term as not found
startFound = false;
//Set this point in line as start of next term
//Guarantees next character is found as
//start of the next part of the search term
termStarts = pointInLine;
}
//In multiline mode we progress the line number when we find a \n
//This is to allow the correct calculation of y coords
if(text.contains("\n")){
lineCounter++;
//If current content pointed at is null or not the correct writing mode, skip value until data is found
while(lineCounter<content.length && (content[lineCounter]==null || writingMode!=this.writingMode[lineCounter])){
lineCounter++;
}
}
}
//If only finding first occurrence,
//Stop searching this text data for search term.
if(firstOccuranceOnly){
foundFirst = true;
break;
}
}
//If only finding first occurrence and first is found,
//Stop searching all text data for this search term.
if(firstOccuranceOnly && foundFirst){
break;
}
}
}
}
//Remove any trailing empty values
resultCoords.trim();
//If including tease values
if(includeTease){
//Remove any trailing empty values
resultTeasers.trim();
//Store teasers so they can be retrieved by different search methods
if (usingMultipleTerms){
//Store all teasers so they may be returned as a sorted map
//Only used for one method controlled by the above flag
for(int i=0; i!=resultTeasers.size(); i++)
multipleTermTeasers.add(resultTeasers.elementAt(i));
}else{
//Store all teasers to be retrieved by getTeaser() method
teasers = resultTeasers.get();
}
}
}
}
//Return coord data for search results