}
/**
 * Extracts the words found on each page of the selected PDF, together with
 * their bounding co-ordinates, and writes them out as one text file per page.
 *
 * Files are written to <code>output_dir/wordlist/</code> and are named
 * <code>fileName_page_text.txt</code>, or <code>fileName_page_xml.txt</code>
 * when XML extraction is requested. Each line of a file has the form
 * <code>word,x1,y1,x2,y2</code>.
 *
 * The user is asked before an existing file is overwritten and can cancel the
 * export through the progress monitor. The PDF and the progress monitor are
 * released on every exit path.
 *
 * @param startPage first page to extract (inclusive)
 * @param endPage last page to extract (inclusive)
 * @param output_dir directory under which the <code>wordlist</code> folder is created
 * @param useXMLExtraction if true the words keep their XML markup and are written
 *        as UTF-8; otherwise the markup is stripped and the platform default
 *        encoding (<code>file.encoding</code>) is used
 */
private void decodeTextWordlist(int startPage, int endPage, String output_dir,boolean useXMLExtraction) {

    PdfDecoder decode_pdf=null;

    //PdfDecoder returns a PdfException if there is a problem
    try {
        decode_pdf = new PdfDecoder(false);
        decode_pdf.setExtractionMode(PdfDecoder.TEXT); //extract just text
        decode_pdf.init(true);

        //always reset to use unaltered co-ords - allow use of rotated or unrotated
        // co-ordinates on pages with rotation (used to be in PdfDecoder)
        PdfGroupingAlgorithms.useUnrotatedCoords=false;

        /**
         * open the file (and read metadata including pages in file)
         */
        decode_pdf.openPdfFile(selectedFile);

    } catch (PdfSecurityException e) {
        System.err.println("Exception " + e+" in pdf code for wordlist"+selectedFile);
    } catch (PdfException e) {
        System.err.println("Exception " + e+" in pdf code for wordlist"+selectedFile);
    } catch (Exception e) {
        System.err.println("Exception " + e+" in pdf code for wordlist"+selectedFile);
        e.printStackTrace();
    }

    //the decoder could not even be created (already reported above) so there
    //is nothing to query, extract from or close
    if (decode_pdf == null) {
        return;
    }

    try {
        /**
         * extract data from pdf (if allowed).
         */
        if ((decode_pdf.isEncrypted()&&(!decode_pdf.isPasswordSupplied()))&& (!decode_pdf.isExtractionAllowed())) {
            System.out.println("Encrypted settings");
            System.out.println("Please look at SimpleViewer for code sample to handle such files");
        } else {

            ProgressMonitor status = new ProgressMonitor(currentGUI.getFrame(),
                    Messages.getMessage("PdfViewerMessage.ExtractText"),"",startPage,endPage);

            /**
             * extract data from pdf
             */
            try {
                //number of pages processed so far, reported if the user stops the export
                int count=0;
                boolean yesToAll = false;

                for (int page = startPage; page < endPage + 1; page++) { //read pages

                    if(status.isCanceled()){
                        currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") +
                                count+ ' ' +Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));
                        return;
                    }

                    //decode the page
                    decode_pdf.decodePage(page);

                    /** create a grouping object to apply grouping to data*/
                    PdfGroupingAlgorithms currentGrouping =decode_pdf.getGroupingObject();

                    /**use whole page size for demo - get data from PageData object*/
                    PdfPageData currentPageData = decode_pdf.getPdfPageData();

                    int x1 = currentPageData.getMediaBoxX(page);
                    int x2 = currentPageData.getMediaBoxWidth(page)+x1;

                    //bottom edge comes from the Y origin of the media box
                    //(this previously read the X origin by mistake)
                    int y2 = currentPageData.getMediaBoxY(page);
                    //NOTE(review): top edge is kept as height minus origin, as before;
                    //confirm whether it should be height plus origin when the
                    //media box does not start at 0
                    int y1 = currentPageData.getMediaBoxHeight(page)-y2;

                    /**Co-ordinates are x1,y1 (top left hand corner), x2,y2(bottom right) */

                    /**The call to extract the list*/
                    List words =null;
                    try{
                        words =currentGrouping.extractTextAsWordlist(
                                x1,
                                y1,
                                x2,
                                y2,
                                page,
                                true,"&:=()!;.,\\/\"\"\'\'");
                    } catch (PdfException e) {
                        //do not close the file here: the loop carries on with the
                        //next page and the file is closed once, in the finally below
                        System.err.println("Exception= "+ e+" in "+selectedFile);
                        e.printStackTrace();
                    }catch(Error e){
                        e.printStackTrace();
                    }

                    if (words == null) {
                        System.out.println("No text found");
                    } else {

                        String target=output_dir+separator+"wordlist"+separator;

                        //create a directory if it doesn't exist
                        File output_path = new File(target);
                        if (!output_path.exists()) {
                            output_path.mkdirs();
                        }

                        /**
                         * choose correct prefix
                         */
                        String prefix="_text.txt";
                        String encoding=System.getProperty("file.encoding");
                        if(useXMLExtraction){
                            prefix="_xml.txt";
                            encoding="UTF-8";
                        }

                        File fileToSave = new File(target + fileName+ '_' +page + prefix);

                        if(fileToSave.exists() && !yesToAll){
                            //NOTE(review): a two page range (difference of 1) takes the
                            //single-file dialog below - confirm this is intended
                            if((endPage - startPage) > 1){
                                int n = currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(),true);

                                if(n==0){
                                    // clicked yes so just carry on for this once
                                }else if(n==1){
                                    // clicked yes to all, so set flag
                                    yesToAll = true;
                                }else if(n==2){
                                    // clicked no, so loop round again
                                    status.setProgress(page);
                                    continue;
                                }else{
                                    currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") +
                                            count+ ' ' +Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));
                                    return;
                                }
                            }else{
                                int n = currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(),false);

                                if(n!=0){
                                    // clicked no, so exit
                                    return;
                                }
                                // clicked yes so just carry on
                            }
                        }

                        /**
                         * output the data
                         */
                        OutputStreamWriter output_stream =
                                new OutputStreamWriter(
                                        new FileOutputStream(fileToSave),
                                        encoding);

                        try {
                            /**each word is stored as 5 consecutive values (word,x1,y1,x2,y2)*/
                            Iterator wordIterator=words.iterator();
                            while(wordIterator.hasNext()){

                                String currentWord=(String) wordIterator.next();

                                /**remove the XML formatting if present - not needed for pure text*/
                                if(!useXMLExtraction) {
                                    currentWord=Strip.convertToText(currentWord,true);
                                }

                                int wx1=(int)Float.parseFloat((String) wordIterator.next());
                                int wy1=(int)Float.parseFloat((String) wordIterator.next());
                                int wx2=(int)Float.parseFloat((String) wordIterator.next());
                                int wy2=(int)Float.parseFloat((String) wordIterator.next());

                                /**this could be inserting into a database instead*/
                                output_stream.write(currentWord+ ',' +wx1+ ',' +wy1+ ',' +wx2+ ',' +wy2+ '\n');
                            }
                        } finally {
                            //release the file handle even if a value fails to parse or write
                            output_stream.close();
                        }
                    }

                    count++;
                    status.setProgress(page+1);

                    //remove data once written out
                    decode_pdf.flushObjectValues(false);
                }

                //dismiss the progress monitor before reporting success
                status.close();
                currentGUI.showMessageDialog(Messages.getMessage("PdfViewerMessage.TextSavedTo")+ ' ' +output_dir);

            } catch (Exception e) {
                System.err.println("Exception "+ e+" in "+selectedFile);
                e.printStackTrace();
            }catch(Error e){
                e.printStackTrace();
            } finally {
                //covers the cancel / overwrite-refused / error exits as well
                status.close();
            }
        }
    } finally {
        /**close the pdf file*/
        decode_pdf.closePdfFile();
    }
}