Package org.apache.poi.hwpf.extractor

Examples of org.apache.poi.hwpf.extractor.WordExtractor


                resourceURL = new URL(url);
                is = resourceURL.openStream();
            }

            POIFSFileSystem fs = new POIFSFileSystem(is);
            WordExtractor extractor = new WordExtractor(fs);
            String wordText = extractor.getText();

            Document document = new Document();
            document.add(new Field("id", id, Field.Store.YES, Field.Index.TOKENIZED));
            document.add(
                    new Field("content", wordText, Field.Store.NO, Field.Index.TOKENIZED));
View Full Code Here


        // get metadata and text
        FileInputStream fin = null;
        try {
            fin = new FileInputStream(filename);

            WordExtractor we = new WordExtractor(fin);

            // get meta data
            SummaryInformation si = we.getSummaryInformation();
            documentAuthor = si.getAuthor();
            documentTitle = si.getTitle();
            documentKeywords = si.getKeywords();

            // get text
            documentText = we.getText();
        }
        catch (IOException ioe) {
            log.error("parse() failed at Word file=" + filename, ioe);
            throw new ConverterException("Word::parse() failed at Word file=" + filename, ioe);
        }
View Full Code Here

     * Bug 33519 - HWPF fails to read a file
     */
    public void test33519()
    {
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug33519.doc" );
        WordExtractor extractor = new WordExtractor( doc );
        extractor.getText();
    }
View Full Code Here

     * Bug 34898 - WordExtractor doesn't read the whole string from the file
     */
    public void test34898()
    {
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug34898.doc" );
        WordExtractor extractor = new WordExtractor( doc );
        assertEquals( "\u30c7\u30a3\u30ec\u30af\u30c8\u30ea", extractor
                .getText().trim() );
    }
View Full Code Here

     * Bug 44431 - HWPFDocument.write destroys fields
     */
    public void test44431()
    {
        HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug44431.doc" );
        WordExtractor extractor1 = new WordExtractor( doc1 );

        HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack( doc1 );
        WordExtractor extractor2 = new WordExtractor( doc2 );

        assertEquals( extractor1.getFooterText(), extractor2.getFooterText() );
        assertEquals( extractor1.getHeaderText(), extractor2.getHeaderText() );
        assertEquals( Arrays.toString( extractor1.getParagraphText() ),
                Arrays.toString( extractor2.getParagraphText() ) );

        assertEquals( extractor1.getText(), extractor2.getText() );
    }
View Full Code Here

     * Bug 44431 - HWPFDocument.write destroys fields
     */
    public void test44431_2()
    {
        HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug44431.doc" );
        WordExtractor extractor1 = new WordExtractor( doc1 );

        assertEquals( "File name=FieldsTest.doc\n" +
            "\n" +
            "\n" +
            "STYLEREF test\n" +
            "\n" +
            "\n" +
            "\n" +
            "TEST TABLE OF CONTENTS\n" +
            "\n" +
            "Heading paragraph in next page\t2\n" +
            "Another heading paragraph in further page\t3\n" +
            "Another heading paragraph in further page\t3\n" +
            "\n" +
            "\n" +
            "Heading paragraph in next page\n" +
            "Another heading paragraph in further page\n" +
            "\n" +
            "\n" +
            "\n" +
            "Page 3 of 3", extractor1.getText() );
    }
View Full Code Here

     * Bug 45473 - HWPF cannot read file after save
     */
    public void test45473()
    {
        HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug45473.doc" );
        String text1 = new WordExtractor( doc1 ).getText().trim();

        HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack( doc1 );
        String text2 = new WordExtractor( doc2 ).getText().trim();

        // the text in the saved document has some differences in line
        // separators but we tolerate that
        assertEquals( text1.replaceAll( "\n", "" ), text2.replaceAll( "\n", "" ) );
    }
View Full Code Here

     * missing
     */
    public void test46817()
    {
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug46817.doc" );
        WordExtractor extractor = new WordExtractor( doc );
        String text = extractor.getText().trim();

        assertTrue( text.contains( "Nazwa wykonawcy" ) );
        assertTrue( text.contains( "kujawsko-pomorskie" ) );
        assertTrue( text.contains( "ekomel@ekomel.com.pl" ) );
    }
View Full Code Here

     * @throws IOException
     */
    public void test47286() throws IOException
    {
        HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug47286.doc" );
        String text1 = new WordExtractor( doc1 ).getText().trim();

        HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack( doc1 );
        String text2 = new WordExtractor( doc2 ).getText().trim();

        // the text in the saved document has some differences in line
        // separators but we tolerate that
        assertEquals( text1.replaceAll( "\n", "" ), text2.replaceAll( "\n", "" ) );

View Full Code Here

     * some website as an embedded object
     */
    public void test47731() throws Exception
    {
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47731.doc" );
        String foundText = new WordExtractor( doc ).getText();

        assertTrue( foundText
                .contains( "Soak the rice in water for three to four hours" ) );
    }
View Full Code Here

TOP

Related Classes of org.apache.poi.hwpf.extractor.WordExtractor

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.