Package org.apache.poi.hwpf.extractor

Examples of org.apache.poi.hwpf.extractor.WordExtractor


    public Document[] parse(final MultiProtocolURI location, final String mimeType,
            final String charset, final InputStream source)
            throws Parser.Failure, InterruptedException {

        final WordExtractor extractor;

        try {
            extractor = new WordExtractor(source);
        } catch (Exception e) {
            throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location);
        }

        final StringBuilder contents = new StringBuilder(80);
        try {
            contents.append(extractor.getText().trim());
            contents.append(' ');
            contents.append(extractor.getHeaderText());
            contents.append(' ');
            contents.append(extractor.getFooterText());
        } catch (Exception e) {
            throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location);
        }
        String title = (contents.length() > 240) ? contents.substring(0,240) : contents.toString().trim();
        title.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim();
        if (title.length() > 80) title = title.substring(0, 80);
        int l = title.length();
        while (true) {
            title = title.replaceAll("  ", " ");
            if (title.length() == l) break;
            l = title.length();
        }

        Document[] docs;
        docs = new Document[]{new Document(
                  location,
                  mimeType,
                  "UTF-8",
                  this,
                  null,
                  null,
                  title,
                  "", // TODO: AUTHOR
                  extractor.getDocSummaryInformation().getCompany(), // publisher
                  null,
                  null,
                  0.0f, 0.0f,
                  UTF8.getBytes(contents.toString()),
                  null,
View Full Code Here


    }
  }
 
  private void collectWordDocument(POIFSFileSystem filesystem, StringBuilder sb)
    throws IOException {
    WordExtractor extractor = new WordExtractor(filesystem);
    addTextIfAny(sb, extractor.getHeaderText());
    for (String paragraph : extractor.getParagraphText()) {
        sb.append(paragraph).append(' ');
    }

    for (String paragraph : extractor.getFootnoteText()) {
        sb.append(paragraph).append(' ');
    }

    for (String paragraph : extractor.getCommentsText()) {
        sb.append(paragraph).append(' ');
    }

    for (String paragraph : extractor.getEndnoteText()) {
        sb.append(paragraph).append(' ');
    }
    addTextIfAny(sb, extractor.getFooterText());
  }
View Full Code Here

    @Override
    public void run() {
        InputStream isr = null;
        try {
            isr = new FileInputStream(pathToFile);
            WordExtractor word = new WordExtractor(isr);
            String fileContent = "";
            String[] paragraphes = word.getParagraphText();
            for (String paragraph : paragraphes) {
                fileContent += " " + paragraph;
            }
            AddDataToIndex AddDataToIndex = new AddDataToIndex(null);
            AddDataToIndex.doAddData(fileContent, pathToFile, fileName);
View Full Code Here

            String name = entry.getName();
            if (!(entry instanceof DocumentEntry)) {
                // Skip directory entries
            } else if ("WordDocument".equals(name)) {
                setType(metadata, "application/msword");
                WordExtractor extractor = new WordExtractor(filesystem);

                addTextIfAny(xhtml, "header", extractor.getHeaderText());

                for (String paragraph : extractor.getParagraphText()) {
                    xhtml.element("p", paragraph);
                }

                for (String paragraph : extractor.getFootnoteText()) {
                    xhtml.element("p", paragraph);
                }

                for (String paragraph : extractor.getCommentsText()) {
                    xhtml.element("p", paragraph);
                }

                for (String paragraph : extractor.getEndnoteText()) {
                    xhtml.element("p", paragraph);
                }

                addTextIfAny(xhtml, "footer", extractor.getFooterText());
            } else if ("PowerPoint Document".equals(name)) {
                setType(metadata, "application/vnd.ms-powerpoint");
                PowerPointExtractor extractor =
                    new PowerPointExtractor(filesystem);
                xhtml.element("p", extractor.getText(true, true));
            } else if ("Workbook".equals(name)) {
                setType(metadata, "application/vnd.ms-excel");
                Locale locale = context.get(Locale.class, Locale.getDefault());
                new ExcelExtractor().parse(filesystem, xhtml, locale);
            } else if ("VisioDocument".equals(name)) {
                setType(metadata, "application/vnd.visio");
                VisioTextExtractor extractor =
                    new VisioTextExtractor(filesystem);
                for (String text : extractor.getAllText()) {
                    xhtml.element("p", text);
                }
            } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
                // TODO: Cleaner mechanism for detecting Outlook
                outlookExtracted = true;
View Full Code Here

     
      if(entry.getName().equals("Workbook")) {
        return new ExcelExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("WordDocument")) {
        return new WordExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("PowerPoint Document")) {
        return new PowerPointExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("VisioDocument")) {
View Full Code Here

                  xhtml.element("p", extractor.getText());
               }
            } else if (entry instanceof DocumentEntry) {
               if ("WordDocument".equals(name)) {
                   setType(metadata, "application/msword");
                   WordExtractor extractor = new WordExtractor(filesystem);

                   addTextIfAny(xhtml, "header", extractor.getHeaderText());

                   for (String paragraph : extractor.getParagraphText()) {
                       xhtml.element("p", paragraph);
                   }

                   for (String paragraph : extractor.getFootnoteText()) {
                       xhtml.element("p", paragraph);
                   }

                   for (String paragraph : extractor.getCommentsText()) {
                       xhtml.element("p", paragraph);
                   }

                   for (String paragraph : extractor.getEndnoteText()) {
                       xhtml.element("p", paragraph);
                   }

                   addTextIfAny(xhtml, "footer", extractor.getFooterText());
               } else if ("PowerPoint Document".equals(name)) {
                   setType(metadata, "application/vnd.ms-powerpoint");
                   PowerPointExtractor extractor =
                       new PowerPointExtractor(filesystem);
                   xhtml.element("p", extractor.getText(true, true));
               } else if ("Workbook".equals(name)) {
                   setType(metadata, "application/vnd.ms-excel");
                   Locale locale = context.get(Locale.class, Locale.getDefault());
                   new ExcelExtractor().parse(filesystem, xhtml, locale);
               } else if ("VisioDocument".equals(name)) {
                   setType(metadata, "application/vnd.visio");
                   VisioTextExtractor extractor =
                       new VisioTextExtractor(filesystem);
                   for (String text : extractor.getAllText()) {
                       xhtml.element("p", text);
                   }
               } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
                   // TODO: Cleaner mechanism for detecting Outlook
                   outlookExtracted = true;
View Full Code Here

    } else {
      throw new IllegalArgumentException("Parameter must be instance of byte[]");
    }
    String ret = null;
    try {
      WordExtractor docextractor = new WordExtractor(is);
      ret = docextractor.getText();
    } catch (OldWordFileFormatException e) {
      try {
        is.reset();
        Word6Extractor docextractor = new Word6Extractor(is);
        ret = docextractor.getText();
      } catch (IOException e1) {
        throw new CRException(e1);
      }

    } catch (IOException e) {
View Full Code Here

     
      if(entry.getName().equals("Workbook")) {
        return new ExcelExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("WordDocument")) {
        return new WordExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("PowerPoint Document")) {
        return new PowerPointExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("VisioDocument")) {
View Full Code Here

            String name = entry.getName();
            if (!(entry instanceof DocumentEntry)) {
                // Skip directory entries
            } else if ("WordDocument".equals(name)) {
                setType(metadata, "application/msword");
                WordExtractor extractor = new WordExtractor(filesystem);

                addTextIfAny(xhtml, "header", extractor.getHeaderText());

                for (String paragraph : extractor.getParagraphText()) {
                    xhtml.element("p", paragraph);
                }

                for (String paragraph : extractor.getFootnoteText()) {
                    xhtml.element("p", paragraph);
                }

                for (String paragraph : extractor.getCommentsText()) {
                    xhtml.element("p", paragraph);
                }

                for (String paragraph : extractor.getEndnoteText()) {
                    xhtml.element("p", paragraph);
                }

                addTextIfAny(xhtml, "footer", extractor.getFooterText());
            } else if ("PowerPoint Document".equals(name)) {
                setType(metadata, "application/vnd.ms-powerpoint");
                PowerPointExtractor extractor =
                    new PowerPointExtractor(filesystem);
                xhtml.element("p", extractor.getText(true, true));
            } else if ("Workbook".equals(name)) {
                setType(metadata, "application/vnd.ms-excel");
                new ExcelExtractor().parse(filesystem, xhtml);
            } else if ("VisioDocument".equals(name)) {
                setType(metadata, "application/vnd.visio");
                VisioTextExtractor extractor =
                    new VisioTextExtractor(filesystem);
                for (String text : extractor.getAllText()) {
                    xhtml.element("p", text);
                }
            } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
                // TODO: Cleaner mechanism for detecting Outlook
                outlookExtracted = true;
View Full Code Here

     * Bug 33519 - HWPF fails to read a file
     */
    public void test33519()
    {
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug33519.doc" );
        WordExtractor extractor = new WordExtractor( doc );
        extractor.getText();
    }
View Full Code Here

TOP

Related Classes of org.apache.poi.hwpf.extractor.WordExtractor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.