Package org.apache.poi.xssf.eventusermodel

Source Code of org.apache.poi.xssf.eventusermodel.XLSX2CSV$MyXSSFSheetHandler

/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */

package org.apache.poi.xssf.eventusermodel;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.ArrayList;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
* A rudimentary XLSX -> CSV processor modeled on the
* POI sample program XLS2CSVmra by Nick Burch from the
* package org.apache.poi.hssf.eventusermodel.examples.
* Unlike the HSSF version, this one completely ignores
* missing rows.
* <p/>
* Data sheets are read using a SAX parser to keep the
* memory footprint relatively small, so this should be
* able to read enormous workbooks.  The styles table and
* the shared-string table must be kept in memory.  The
* standard POI styles table class is used, but a custom
* (read-only) class is used for the shared string table
* because the standard POI SharedStringsTable grows very
* quickly with the number of unique strings.
* <p/>
* Thanks to Eric Smith for a patch that fixes a problem
* triggered by cells with multiple "t" elements, which is
* how Excel represents different formats (e.g., one word
* plain and one word bold).
*
* @author Chris Lott
*/
public class XLSX2CSV {

    /**
     * The type of the data value is indicated by an attribute on the cell.
     * The value is usually in a "v" element within the cell.
     */
    enum xssfDataType {
        BOOL,
        ERROR,
        FORMULA,
        INLINESTR,
        SSTINDEX,
        NUMBER,
    }

    /**
     * Each cell is enclosed in "si".  Each cell can have multiple "t" elements.
     * Example input
     *
     * <pre>
   &lt;?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
  &lt;sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="2" uniqueCount="2">
    &lt;si>
      &lt;r>
        &lt;rPr>
          &lt;b />
          &lt;sz val="11" />
          &lt;color theme="1" />
            &lt;rFont val="Calibri" />
            &lt;family val="2" />
            &lt;scheme val="minor" />
          &lt;/rPr>
          &lt;t>This:&lt;/t>
        &lt;/r>
      &lt;r>
        &lt;rPr>
            &lt;sz val="11" />
          &lt;color theme="1" />
            &lt;rFont val="Calibri" />
            &lt;family val="2" />
            &lt;scheme val="minor" />
          &lt;/rPr>
          &lt;t xml:space="preserve">Causes Problems&lt;/t>
        &lt;/r>
      &lt;/si>
    &lt;si>
        &lt;t>This does not&lt;/t>
      &lt;/si>
    &lt;/sst>
     * </pre>
     *
     */
    static class ReadonlySharedStringsTable extends DefaultHandler {

        /**
         * An integer representing the total count of strings in the workbook. This count does not
         * include any numbers, it counts only the total of text strings in the workbook.
         */
        private int count;

        /**
         * An integer representing the total count of unique strings in the Shared String Table.
         * A string is unique even if it is a copy of another string, but has different formatting applied
         * at the character level.
         */
        private int uniqueCount;

        /**
         * The shared strings table.
         */
        private String[] strings;

        /**
         * @param pkg
         * @throws IOException
         * @throws SAXException
         * @throws ParserConfigurationException
         */
        public ReadonlySharedStringsTable(OPCPackage pkg)
                throws IOException, SAXException, ParserConfigurationException {
            ArrayList<PackagePart> parts =
                    pkg.getPartsByContentType(XSSFRelation.SHARED_STRINGS.getContentType());
            PackagePart sstPart = parts.get(0);
            readFrom(sstPart.getInputStream());
        }

        /**
         * Like POIXMLDocumentPart constructor
         *
         * @param part
         * @param rel_ignored
         * @throws IOException
         */
        public ReadonlySharedStringsTable(PackagePart part, PackageRelationship rel_ignored)
                throws IOException, SAXException, ParserConfigurationException {
            readFrom(part.getInputStream());
        }

        /**
         * Read this shared strings table from an XML file.
         *
         * @param is The input stream containing the XML document.
         * @throws IOException                  if an error occurs while reading.
         * @throws SAXException
         * @throws ParserConfigurationException
         */
        public void readFrom(InputStream is) throws IOException, SAXException, ParserConfigurationException {
            InputSource sheetSource = new InputSource(is);
            SAXParserFactory saxFactory = SAXParserFactory.newInstance();
            SAXParser saxParser = saxFactory.newSAXParser();
            XMLReader sheetParser = saxParser.getXMLReader();
            sheetParser.setContentHandler(this);
            sheetParser.parse(sheetSource);
        }

        /**
         * Return an integer representing the total count of strings in the workbook. This count does not
         * include any numbers, it counts only the total of text strings in the workbook.
         *
         * @return the total count of strings in the workbook
         */
        public int getCount() {
            return this.count;
        }

        /**
         * Returns an integer representing the total count of unique strings in the Shared String Table.
         * A string is unique even if it is a copy of another string, but has different formatting applied
         * at the character level.
         *
         * @return the total count of unique strings in the workbook
         */
        public int getUniqueCount() {
            return this.uniqueCount;
        }

        /**
         * Return a string item by index
         *
         * @param idx index of item to return.
         * @return the item at the specified position in this Shared String table.
         */
        public String getEntryAt(int idx) {
            return strings[idx];
        }

        //// ContentHandler methods ////

        private StringBuffer characters;
        private boolean tIsOpen;
        private int index;

        /*
       * (non-Javadoc)
       * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
       */
        public void startElement(String uri, String localName, String name,
                                 Attributes attributes) throws SAXException {
            if ("sst".equals(name)) {
                String count = attributes.getValue("count");
                String uniqueCount = attributes.getValue("uniqueCount");
                this.count = Integer.parseInt(count);
                this.uniqueCount = Integer.parseInt(uniqueCount);
                this.strings = new String[this.uniqueCount];
                index = 0;
                characters = new StringBuffer();
            } else if ("si".equals(name)) {
                characters.setLength(0);
            } else if ("t".equals(name)) {
                tIsOpen = true;
            }
        }

        /*
       * (non-Javadoc)
       * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
       */
        public void endElement(String uri, String localName, String name)
                throws SAXException {
            if ("si".equals(name)) {
                strings[index] = characters.toString();             
                ++index;
            } else if ("t".equals(name)) {
              tIsOpen = false;
            }
        }

        /**
         * Captures characters only if a t(ext?) element is open.
         */
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            if (tIsOpen)
                characters.append(ch, start, length);
        }

    }

    /**
     * Derived from http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api
     * <p/>
     * Also see Standard ECMA-376, 1st edition, part 4, pages 1928ff, at
     * http://www.ecma-international.org/publications/standards/Ecma-376.htm
     * <p/>
     * A web-friendly version is http://openiso.org/Ecma/376/Part4
     */
    class MyXSSFSheetHandler extends DefaultHandler {

        /**
         * Table with styles
         */
        private StylesTable stylesTable;

        /**
         * Table with unique strings
         */
        private ReadonlySharedStringsTable sharedStringsTable;

        /**
         * Destination for data
         */
        private final PrintStream output;

        /**
         * Number of columns to read starting with leftmost
         */
        private final int minColumnCount;

        // Set when V start element is seen
        private boolean vIsOpen;

        // Set when cell start element is seen;
        // used when cell close element is seen.
        private xssfDataType nextDataType;

        // Used to format numeric cell values.
        private short formatIndex;
        private String formatString;
        private final DataFormatter formatter;

        private int thisColumn = -1;
        // The last column printed to the output stream
        private int lastColumnNumber = -1;

        // Gathers characters as they are seen.
        private StringBuffer value;

        /**
         * Accepts objects needed while parsing.
         *
         * @param styles  Table of styles
         * @param strings Table of shared strings
         * @param cols    Minimum number of columns to show
         * @param target  Sink for output
         */
        public MyXSSFSheetHandler(
                StylesTable styles,
                ReadonlySharedStringsTable strings,
                int cols,
                PrintStream target) {
            this.stylesTable = styles;
            this.sharedStringsTable = strings;
            this.minColumnCount = cols;
            this.output = target;
            this.value = new StringBuffer();
            this.nextDataType = xssfDataType.NUMBER;
            this.formatter = new DataFormatter();
        }

        /*
           * (non-Javadoc)
           * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
           */
        public void startElement(String uri, String localName, String name,
                                 Attributes attributes) throws SAXException {

            if ("inlineStr".equals(name) || "v".equals(name)) {
                vIsOpen = true;
                // Clear contents cache
                value.setLength(0);
            }
            // c => cell
            else if ("c".equals(name)) {
                // Get the cell reference
                String r = attributes.getValue("r");
                int firstDigit = -1;
                for (int c = 0; c < r.length(); ++c) {
                    if (Character.isDigit(r.charAt(c))) {
                        firstDigit = c;
                        break;
                    }
                }
                thisColumn = nameToColumn(r.substring(0, firstDigit));

                // Set up defaults.
                this.nextDataType = xssfDataType.NUMBER;
                this.formatIndex = -1;
                this.formatString = null;
                String cellType = attributes.getValue("t");
                String cellStyleStr = attributes.getValue("s");
                if ("b".equals(cellType))
                    nextDataType = xssfDataType.BOOL;
                else if ("e".equals(cellType))
                    nextDataType = xssfDataType.ERROR;
                else if ("inlineStr".equals(cellType))
                    nextDataType = xssfDataType.INLINESTR;
                else if ("s".equals(cellType))
                    nextDataType = xssfDataType.SSTINDEX;
                else if ("str".equals(cellType))
                    nextDataType = xssfDataType.FORMULA;
                else if (cellStyleStr != null) {
                    /*
                          * It's a number, but possibly has a style and/or special format.
                          * Nick Burch said to use org.apache.poi.ss.usermodel.BuiltinFormats,
                          * and I see javadoc for that at apache.org, but it's not in the
                          * POI 3.5 Beta 5 jars.  Scheduled to appear in 3.5 beta 6.
                          */
                    int styleIndex = Integer.parseInt(cellStyleStr);
                    XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
                    this.formatIndex = style.getDataFormat();
                    this.formatString = style.getDataFormatString();
                    if (this.formatString == null)
                        this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex);
                }
            }

        }

        /*
           * (non-Javadoc)
           * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
           */
        public void endElement(String uri, String localName, String name)
                throws SAXException {

            String thisStr = null;

            // v => contents of a cell
            if ("v".equals(name)) {
                // Process the value contents as required.
                // Do now, as characters() may be called more than once
                switch (nextDataType) {

                    case BOOL:
                        char first = value.charAt(0);
                        thisStr = first == '0' ? "FALSE" : "TRUE";
                        break;

                    case ERROR:
                        thisStr = "\"ERROR:" + value.toString() + '"';
                        break;

                    case FORMULA:
                        // A formula could result in a string value,
                        // so always add double-quote characters.
                        thisStr = '"' + value.toString() + '"';
                        break;

                    case INLINESTR:
                        // TODO: have seen an example of this, so it's untested.
                        XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
                        thisStr = '"' + rtsi.toString() + '"';
                        break;

                    case SSTINDEX:
                        String sstIndex = value.toString();
                        try {
                            int idx = Integer.parseInt(sstIndex);
                            XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx));
                            thisStr = '"' + rtss.toString() + '"';
                        }
                        catch (NumberFormatException ex) {
                            output.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString());
                        }
                        break;

                    case NUMBER:
                        String n = value.toString();
                        if (this.formatString != null)
                            thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString);
                        else
                            thisStr = n;
                        break;

                    default:
                        thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
                        break;
                }

                // Output after we've seen the string contents
                // Emit commas for any fields that were missing on this row
                if (lastColumnNumber == -1) {
                    lastColumnNumber = 0;
                }
                for (int i = lastColumnNumber; i < thisColumn; ++i)
                    output.print(',');

                // Might be the empty string.
                output.print(thisStr);

                // Update column
                if (thisColumn > -1)
                    lastColumnNumber = thisColumn;

            } else if ("row".equals(name)) {

                // Print out any missing commas if needed
                if (minColumns > 0) {
                    // Columns are 0 based
                    if (lastColumnNumber == -1) {
                        lastColumnNumber = 0;
                    }
                    for (int i = lastColumnNumber; i < (this.minColumnCount); i++) {
                        output.print(',');
                    }
                }

                // We're onto a new row
                output.println();
                lastColumnNumber = -1;
            }

        }

        /**
         * Captures characters only if a suitable element is open.
         * Originally was just "v"; extended for inlineStr also.
         */
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            if (vIsOpen)
                value.append(ch, start, length);
        }

        /**
         * Converts an Excel column name like "C" to a zero-based index.
         *
         * @param name
         * @return Index corresponding to the specified name
         */
        private int nameToColumn(String name) {
            int column = -1;
            for (int i = 0; i < name.length(); ++i) {
                int c = name.charAt(i);
                column = (column + 1) * 26 + c - 'A';
            }
            return column;
        }

    }

    ///////////////////////////////////////

    private OPCPackage xlsxPackage;
    private int minColumns;
    private PrintStream output;

    /**
     * Creates a new XLSX -> CSV converter
     *
     * @param pkg        The XLSX package to process
     * @param output     The PrintStream to output the CSV to
     * @param minColumns The minimum number of columns to output, or -1 for no minimum
     */
    public XLSX2CSV(OPCPackage pkg, PrintStream output, int minColumns) {
        this.xlsxPackage = pkg;
        this.output = output;
        this.minColumns = minColumns;
    }

    /**
     * Parses and shows the content of one sheet
     * using the specified styles and shared-strings tables.
     *
     * @param styles
     * @param strings
     * @param sheetInputStream
     */
    public void processSheet(
            StylesTable styles,
            ReadonlySharedStringsTable strings,
            InputStream sheetInputStream)
            throws IOException, ParserConfigurationException, SAXException {

        InputSource sheetSource = new InputSource(sheetInputStream);
        SAXParserFactory saxFactory = SAXParserFactory.newInstance();
        SAXParser saxParser = saxFactory.newSAXParser();
        XMLReader sheetParser = saxParser.getXMLReader();
        ContentHandler handler = new MyXSSFSheetHandler(styles, strings, this.minColumns, this.output);
        sheetParser.setContentHandler(handler);
        sheetParser.parse(sheetSource);
    }

    /**
     * Initiates the processing of the XLS workbook file to CSV.
     *
     * @throws IOException
     * @throws OpenXML4JException
     * @throws ParserConfigurationException
     * @throws SAXException
     */
    public void process()
            throws IOException, OpenXML4JException, ParserConfigurationException, SAXException {

        ReadonlySharedStringsTable strings = new ReadonlySharedStringsTable(this.xlsxPackage);
        XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
        StylesTable styles = xssfReader.getStylesTable();
        XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
        int index = 0;
        while (iter.hasNext()) {
            InputStream stream = iter.next();
            String sheetName = iter.getSheetName();
            this.output.println();
            this.output.println(sheetName + " [index=" + index + "]:");
            processSheet(styles, strings, stream);
            stream.close();
            ++index;
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Use:");
            System.err.println("  XLSX2CSV <xlsx file> [min columns]");
            return;
        }

        File xlsxFile = new File(args[0]);
        if (!xlsxFile.exists()) {
            System.err.println("Not found or not a file: " + xlsxFile.getPath());
            return;
        }

        int minColumns = -1;
        if (args.length >= 2)
            minColumns = Integer.parseInt(args[1]);

        // The package open is instantaneous, as it should be.
        OPCPackage p = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);
    XLSX2CSV xlsx2csv = new XLSX2CSV(p, System.out, minColumns);
    xlsx2csv.process();
  }

}
TOP

Related Classes of org.apache.poi.xssf.eventusermodel.XLSX2CSV$MyXSSFSheetHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.