Examples of org.apache.pdfbox.text.TextPosition

org.apache.pdfbox.text.TextPosition
This represents a string of characters and the position of those characters on the screen. @author Ben Litchfield

            int ltrCount = 0;
            int rtlCount = 0;


            while (textIter.hasNext())
            {
                TextPosition position = textIter.next();
                String stringValue = position.getUnicode();
                for (int a = 0; a < stringValue.length(); a++)
                {
                    byte dir = Character.getDirectionality(stringValue.charAt(a));
                    if (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ||
                            dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING ||
                            dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)
                    {
                        ltrCount++;
                    } else if (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ||
                            dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC ||
                            dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING ||
                            dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)
                    {
                        rtlCount++;
                    }
                }
            }
            // choose the dominant direction
            boolean isRtlDominant = rtlCount > ltrCount;


            startArticle(!isRtlDominant);
            startOfArticle = true;
            // we will later use this to skip reordering
            boolean hasRtl = rtlCount > 0;


            // Now cycle through to print the text.
            // We queue up a line at a time before we print so that we can convert
            // the line from presentation form to logical form (if needed).
            List<LineItem> line = new ArrayList<LineItem>();


            textIter = textList.iterator();    // start from the beginning again
            // PDF files don't always store spaces. We will need to guess where we should add
            // spaces based on the distances between TextPositions. Historically, this was done
            // based on the size of the space character provided by the font. In general, this
            // worked but there were cases where it did not work. Calculating the average character
            // width and using that as a metric works better in some cases but fails in some cases
            // where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of
            // these examples.


            // Keeps track of the previous average character width
            float previousAveCharWidth = -1;
            while (textIter.hasNext())
            {
                TextPosition position = textIter.next();
                PositionWrapper current = new PositionWrapper(position);
                String characterValue = position.getUnicode();


                // Resets the average character width when we see a change in font
                // or a change in the font size
                if (lastPosition != null &&
                        (position.getFont() != lastPosition.getTextPosition().getFont() ||
                                position.getFontSize() != lastPosition.getTextPosition().getFontSize()))
                {
                    previousAveCharWidth = -1;
                }


                float positionX;
                float positionY;
                float positionWidth;
                float positionHeight;


                // If we are sorting, then we need to use the text direction
                // adjusted coordinates, because they were used in the sorting.
                if (getSortByPosition())
                {
                    positionX = position.getXDirAdj();
                    positionY = position.getYDirAdj();
                    positionWidth = position.getWidthDirAdj();
                    positionHeight = position.getHeightDir();
                } else
                {
                    positionX = position.getX();
                    positionY = position.getY();
                    positionWidth = position.getWidth();
                    positionHeight = position.getHeight();
                }


                // The current amount of characters in a word
                int wordCharCount = position.getIndividualWidths().length;


                // Estimate the expected width of the space based on the
                // space character with some margin.
                float wordSpacing = position.getWidthOfSpace();
                float deltaSpace;
                if (wordSpacing == 0 || wordSpacing == Float.NaN)
                {
                    deltaSpace = Float.MAX_VALUE;
                } else

View Full Code Here

            {
                // test if we overlap the previous entry.
                // Note that we are making an assumption that we need to only look back
                // one TextPosition to find what we are overlapping.
                // This may not always be true. */
                TextPosition previousTextPosition = textList.get(textList.size() - 1);
                if (text.isDiacritic() && previousTextPosition.contains(text))
                {
                    previousTextPosition.mergeDiacritic(text);
                }
                // If the previous TextPosition was the diacritic, merge it into this
                // one and remove it from the list.
                else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
                {
                    text.mergeDiacritic(previousTextPosition);
                    textList.remove(textList.size()-1);
                    textList.add(text);
                }

View Full Code Here

     * @param pw position
     * @return the matching pattern
     */
    protected Pattern matchListItemPattern(PositionWrapper pw) 
    {
        TextPosition tp = pw.getTextPosition();
        String txt = tp.getUnicode();
        return matchPattern(txt,getListItemPatterns());
    }

View Full Code Here

            lineBuilder = new StringBuilder();
            wordPositions.clear();
        }
        else 
        {
            TextPosition text = item.getTextPosition();
            lineBuilder.append(text.getUnicode());
            wordPositions.add(text);
        }
        return lineBuilder;
    }

View Full Code Here

            while (textIter.hasNext())
            {
                Iterator<TextPosition> textByArticle = textIter.next().iterator();
                while (textByArticle.hasNext())
                {
                    TextPosition position = textByArticle.next();


                    float currentFontSize = position.getFontSize();
                    //If we're past 64 chars we will assume that we're past the title
                    //64 is arbitrary
                    if (currentFontSize != lastFontSize || titleText.length() > 64)
                    {
                        if (titleText.length() > 0)
                        {
                            return titleText.toString();
                        }
                        lastFontSize = currentFontSize;
                    }
                    if (currentFontSize > 13.0f)
                    { // most body text is 12pt
                        titleText.append(position.getUnicode());
                    }
                }
            }
        }
        return "";

View Full Code Here

            //
            boolean suppressCharacter = false;
            float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
            for (TextPosition sameTextCharacter : sameTextCharacters)
            {
                TextPosition character = (TextPosition) sameTextCharacter;
                String charCharacter = character.getUnicode();
                float charX = character.getX();
                float charY = character.getY();
                //only want to suppress
                if( charCharacter != null &&
                        //charCharacter.equals( textCharacter ) &&
                        within( charX, textX, tolerance ) &&
                        within( charY,
                                textY,
                                tolerance ) )
                {
                    suppressCharacter = true;
                    break;
                }
            }
            if( !suppressCharacter )
            {
                sameTextCharacters.add( text );
                showCharacter = true;
            }
        }


        if( showCharacter )
        {
            List<TextPosition> textList = new ArrayList<TextPosition>();


            /* In the wild, some PDF encoded documents put diacritics (accents on
             * top of characters) into a separate Tj element.  When displaying them
             * graphically, the two chunks get overlayed.  With text output though,
             * we need to do the overlay. This code recombines the diacritic with
             * its associated character if the two are consecutive.
             */ 
            if(textList.isEmpty())
            {
                textList.add(text);
            }
            else
            {
                /* test if we overlap the previous entry.  
                 * Note that we are making an assumption that we need to only look back
                 * one TextPosition to find what we are overlapping.  
                 * This may not always be true. */
                TextPosition previousTextPosition = (TextPosition)textList.get(textList.size()-1);
                if(text.isDiacritic() && previousTextPosition.contains(text))
                {
                    previousTextPosition.mergeDiacritic(text);
                }
                /* If the previous TextPosition was the diacritic, merge it into this
                 * one and remove it from the list. */
                else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
                {
                    text.mergeDiacritic(previousTextPosition);
                    textList.remove(textList.size()-1);
                    textList.add(text);
                }

View Full Code Here

                // skips them. See the "allah2.pdf" TestTextStripper file.
                return;
            }
        }


        processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(),
                pageSize.getHeight(), textRenderingMatrix, nextX, nextY,
                dyDisplay, dxDisplay,
                spaceWidthDisplay, unicode, new int[] { code } , font, fontSize,
                (int)(fontSize * textRenderingMatrix.getXScale())));
    }

View Full Code Here

TOP

Related Classes of org.apache.pdfbox.text.TextPosition

org.apache.pdfbox.contentstream.PDFTextStreamEngine

org.apache.pdfbox.tools.PDFText2HTML

org.apache.pdfbox.util.PDFMarkedContentExtractor

org.apache.pdfbox.util.PDFTextStripper

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.