Examples of org.apache.pdfbox.util.PDFTextStripperByArea

org.apache.pdfbox.util.PDFTextStripperByArea
This will extract text from a specified region in the PDF. @author Ben Litchfield @version $Revision: 1.5 $

                doc = PDDocument.load( args[0] );
                int pageNum = 0;
                for( PDPage page : doc.getPages() )
                {
                    pageNum++;
                    PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                    List annotations = page.getAnnotations();
                    //first setup text extraction regions
                    for( int j=0; j<annotations.size(); j++ )
                    {
                        PDAnnotation annot = (PDAnnotation)annotations.get( j );
                        if( annot instanceof PDAnnotationLink )
                        {
                            PDAnnotationLink link = (PDAnnotationLink)annot;
                            PDRectangle rect = link.getRectangle();
                            //need to reposition link rectangle to match text space
                            float x = rect.getLowerLeftX();
                            float y = rect.getUpperRightY();
                            float width = rect.getWidth();
                            float height = rect.getHeight();
                            int rotation = page.getRotation();
                            if( rotation == 0 )
                            {
                                PDRectangle pageSize = page.getMediaBox();
                                y = pageSize.getHeight() - y;
                            }
                            else if( rotation == 90 )
                            {
                                //do nothing
                            }


                            Rectangle2D.Float awtRect = new Rectangle2D.Float( x,y,width,height );
                            stripper.addRegion( "" + j, awtRect );
                        }
                    }


                    stripper.extractRegions( page );


                    for( int j=0; j<annotations.size(); j++ )
                    {
                        PDAnnotation annot = (PDAnnotation)annotations.get( j );
                        if( annot instanceof PDAnnotationLink )
                        {
                            PDAnnotationLink link = (PDAnnotationLink)annot;
                            PDAction action = link.getAction();
                            String urlText = stripper.getTextForRegion( "" + j );
                            if( action instanceof PDActionURI )
                            {
                                PDActionURI uri = (PDActionURI)action;
                                System.out.println( "Page " + pageNum +":'" + urlText + "'=" + uri.getURI() );
                            }

View Full Code Here

                    {
                        System.err.println( "Error: Document is encrypted with a password." );
                        System.exit( 1 );
                    }
                }
                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                stripper.setSortByPosition( true );
                Rectangle rect = new Rectangle( 10, 280, 275, 60 );
                stripper.addRegion( "class1", rect );
                PDPage firstPage = document.getPage(0);
                stripper.extractRegions( firstPage );
                System.out.println( "Text in the area:" + rect );
                System.out.println( stripper.getTextForRegion( "class1" ) );


            }
            finally
            {
                if( document != null )

View Full Code Here

                document = PDDocument.load( args[0] );
                if( document.isEncrypted() )
                {
                    document.decrypt( "" );
                }
                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                stripper.setSortByPosition( true );
                Rectangle rect = new Rectangle( 10, 280, 275, 60 );
                stripper.addRegion( "class1", rect );
                List allPages = document.getDocumentCatalog().getAllPages();
                PDPage firstPage = (PDPage)allPages.get( 0 );
                stripper.extractRegions( firstPage );
                System.out.println( "Text in the area:" + rect );
                System.out.println( stripper.getTextForRegion( "class1" ) );


            }
            finally
            {
                if( document != null )

View Full Code Here

     * Code adapted from http://www.docjar.com/html/api/org/apache/pdfbox/examples/pdmodel/PrintURLs.java.html
     */
    private Map<String, PDAction> extractLinks(PDPage page) throws Exception
    {
        Map<String, PDAction> links = new HashMap<String, PDAction>();
        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        List<PDAnnotation> annotations = page.getAnnotations();
        // First setup the text extraction regions.
        for (int j = 0; j < annotations.size(); j++) {
            PDAnnotation annotation = annotations.get(j);
            if (annotation instanceof PDAnnotationLink) {
                PDAnnotationLink link = (PDAnnotationLink) annotation;
                PDRectangle rect = link.getRectangle();
                // Need to reposition link rectangle to match text space.
                float x = rect.getLowerLeftX();
                float y = rect.getUpperRightY();
                float width = rect.getWidth();
                float height = rect.getHeight();
                int rotation = page.findRotation();
                if (rotation == 0) {
                    PDRectangle pageSize = page.findMediaBox();
                    y = pageSize.getHeight() - y;
                } else if (rotation == 90) {
                    // Do nothing.
                }


                Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
                stripper.addRegion(String.valueOf(j), awtRect);
            }
        }


        stripper.extractRegions(page);


        for (int j = 0; j < annotations.size(); j++) {
            PDAnnotation annotation = annotations.get(j);
            if (annotation instanceof PDAnnotationLink) {
                PDAnnotationLink link = (PDAnnotationLink) annotation;
                String label = stripper.getTextForRegion(String.valueOf(j)).trim();
                links.put(label, link.getAction());
            }
        }


        return links;

View Full Code Here

        return destination.toString();
    }


    private String getDestinationText(PDPageXYZDestination destination) throws Exception
    {
        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        stripper.addRegion("destination", getRectangleBelowDestination(destination));
        stripper.extractRegions(destination.getPage());
        return stripper.getTextForRegion("destination").trim();
    }

View Full Code Here

            {
                doc = PDDocument.load( args[0] );
                List allPages = doc.getDocumentCatalog().getAllPages();
                for( int i=0; i<allPages.size(); i++ )
                {
                    PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                    PDPage page = (PDPage)allPages.get( i );
                    List annotations = page.getAnnotations();
                    //first setup text extraction regions
                    for( int j=0; j<annotations.size(); j++ )
                    {
                        PDAnnotation annot = (PDAnnotation)annotations.get( j );
                        if( annot instanceof PDAnnotationLink )
                        {
                            PDAnnotationLink link = (PDAnnotationLink)annot;
                            PDRectangle rect = link.getRectangle();
                            //need to reposition link rectangle to match text space
                            float x = rect.getLowerLeftX();
                            float y = rect.getUpperRightY();
                            float width = rect.getWidth();
                            float height = rect.getHeight();
                            int rotation = page.findRotation();
                            if( rotation == 0 )
                            {
                                PDRectangle pageSize = page.findMediaBox();
                                y = pageSize.getHeight() - y;
                            }
                            else if( rotation == 90 )
                            {
                                //do nothing
                            }


                            Rectangle2D.Float awtRect = new Rectangle2D.Float( x,y,width,height );
                            stripper.addRegion( "" + j, awtRect );
                        }
                    }


                    stripper.extractRegions( page );


                    for( int j=0; j<annotations.size(); j++ )
                    {
                        PDAnnotation annot = (PDAnnotation)annotations.get( j );
                        if( annot instanceof PDAnnotationLink )
                        {
                            PDAnnotationLink link = (PDAnnotationLink)annot;
                            PDAction action = link.getAction();
                            String urlText = stripper.getTextForRegion( "" + j );
                            if( action instanceof PDActionURI )
                            {
                                PDActionURI uri = (PDActionURI)action;
                                System.out.println( "Page " + (i+1) +":'" + urlText + "'=" + uri.getURI() );
                            }

View Full Code Here

                    {
                        System.err.println( "Error: Document is encrypted with a password." );
                        System.exit( 1 );
                    }
                }
                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                stripper.setSortByPosition( true );
                Rectangle rect = new Rectangle( 10, 280, 275, 60 );
                stripper.addRegion( "class1", rect );
                List allPages = document.getDocumentCatalog().getAllPages();
                PDPage firstPage = (PDPage)allPages.get( 0 );
                stripper.extractRegions( firstPage );
                System.out.println( "Text in the area:" + rect );
                System.out.println( stripper.getTextForRegion( "class1" ) );


            }
            finally
            {
                if( document != null )

View Full Code Here

TOP

Related Classes of org.apache.pdfbox.util.PDFTextStripperByArea

org.apache.pdfbox.cos.COSStream

org.apache.pdfbox.examples.pdmodel.PrintURLs

org.apache.pdfbox.examples.util.ExtractTextByArea

org.apache.pdfbox.pdmodel.common.PDStream

org.xwiki.test.misc.PDFTest

java.util.ArrayList

java.util.Vector

java.awt.geom.Rectangle2D

java.util.Iterator

java.io.StringWriter

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.