java POI 读取大数据量 excel 处理空单元格问题

import lombok.Data;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.util.ArrayList;
import java.util.List;

/**
 * @author: 
 * @descripition: 快速读取excel
 * @date: created in 15:30 2020/10/13
 * @modify: Copyright (c) Supermap All Rights Reserved.
 */
@Data
public class Excel07Parser extends DefaultHandler {

    //取SST 的索引对应的值
    private SharedStringsTable sst;

    //解析结果保存
    private List<List<String>> container;

    // 开始行
    private Integer startRow=0;

    // 结束行
    private Integer endRow=0;

    // 当前行
    private Integer row;

    // 是否是查询数据
    private Boolean is=false;

    private Excel07Parser.CellDataType nextDataType = Excel07Parser.CellDataType.SSTINDEX;
    private final DataFormatter formatter = new DataFormatter();
    private short formatIndex;
    private String formatString;

    private StylesTable stylesTable;

    //用一个enum表示单元格可能的数据类型
    enum CellDataType{
        BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, DATE, NULL
    }



    public Excel07Parser(SharedStringsTable sst,StylesTable stylesTable, List<List<String>> container) {
        this.sst = sst;
        this.container = container;
        this.stylesTable = stylesTable;
    }

    public Excel07Parser(SharedStringsTable sst, List<List<String>> container, Integer startRow, Integer endRow, Boolean is) {
        this.sst = sst;
        this.container = container;
        this.startRow = startRow;
        this.endRow = endRow;
        this.is = is;
    }

    /**
     * 存储cell标签下v标签包裹的字符文本内容
     * 在v标签开始后,解析器自动调用characters()保存到 lastContents
     * 【但】当cell标签的属性 s是 t时, 表示取到的lastContents是 SharedStringsTable 的index值
     * 需要在v标签结束时根据 index(lastContents)获取一次真正的值
     */
    private String lastContents;

    //有效数据矩形区域,A1:Y2
    private String dimension;

    //根据dimension得出每行的数据长度
    private int longest;

    //上个有内容的单元格id,判断空单元格
    private String lastCellid;

    //上一行id, 判断空行
    private String lastRowid;

    // 判断单元格cell的c标签下是否有v,否则可能数据错位
    private boolean hasV = false;

    //行数据保存
    private List<String> currentRow;

    //单元格内容是SST 的索引
    private boolean isSSTIndex = false;


    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {

        lastContents = "";
        if (qName.equals("dimension")) {
            dimension = attributes.getValue("ref");
            longest = covertRowIdtoInt(dimension.substring(dimension.indexOf(":") + 1));
        }
        //行开始
        if (qName.equals("row")) {
            String rowNum = attributes.getValue("r");
            row = Integer.parseInt(rowNum);
            //判断空行
            if (lastRowid != null) {
                //与上一行相差2, 说明中间有空行
                int gap = Integer.parseInt(rowNum) - Integer.parseInt(lastRowid);
                if (gap > 1) {
                    gap -= 1;
                    while (gap > 0) {
                        container.add(new ArrayList<>());
                        gap--;
                    }
                }
            }
            lastRowid = attributes.getValue("r");
            currentRow = new ArrayList<>();
        }
        if (qName.equals("c")) {
            // 设置单元格的数据类型
            this.setNextDataType(attributes);

            String rowId = attributes.getValue("r");

            //空单元判断,添加空字符到list
            if (lastCellid != null) {
                int gap = covertRowIdtoInt(rowId) - covertRowIdtoInt(lastCellid);
                for (int i = 0; i < gap - 1; i++) {
                    currentRow.add("");
                }
            } else {
                //第一个单元格可能不是在第一列
                if (!"A1".equals(rowId)) {
                    for (int i = 0; i < covertRowIdtoInt(rowId) - 1; i++) {
                        currentRow.add("");
                    }
                }
            }
            lastCellid = rowId;

            //判断单元格的值是SST 的索引,不能直接characters方法取值
            if (attributes.getValue("t") != null && attributes.getValue("t").equals("s")) {
                isSSTIndex = true;
            } else {
                isSSTIndex = false;
            }
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {

        //行结束,存储一行数据
        if (qName.equals("row")) {

            //判断最后一个单元格是否在最后,补齐列数
            //【注意】有的单元格只修改单元格格式,而没有内容,会出现c标签下没有v标签,导致currentRow少
            if (covertRowIdtoInt(lastCellid) < longest) {
                int min = Math.min(currentRow.size(), covertRowIdtoInt(lastCellid));
                for (int i = 0; i < longest - min; i++) {
                    currentRow.add("");
                }
            }
            if (is){
                if (row==1 || row==2 ||(row > startRow && row < endRow)){
                    container.add(currentRow);
                }
            }else {
                container.add(currentRow);
            }
            lastCellid = null;
        }

        //单元格结束,没有v时需要补位
        if (qName.equals("c")){
            if (!hasV) currentRow.add("");
            hasV = false;
        }

        //单元格内容标签结束,characters方法会被调用处理内容
        if (qName.equals("v")) {
            hasV = true;
            //单元格的值是SST 的索引
            if (isSSTIndex) {
                String sstIndex = lastContents.toString();
                try {
                    int idx = Integer.parseInt(sstIndex);
                    XSSFRichTextString rtss = new XSSFRichTextString(
                            sst.getEntryAt(idx));
                    lastContents = rtss.toString();
                    currentRow.add(lastContents);
                } catch (NumberFormatException ex) {
                    System.out.println(lastContents);
                }
            } else {
                lastContents = this.getDataValue(lastContents.trim(), "");
                currentRow.add(lastContents);
            }

        }

    }


    /**
     * 获取element的文本数据
     *
     * @see org.xml.sax.ContentHandler#characters
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        lastContents += new String(ch, start, length);
    }

    /**
     * 列号转数字   AB7-->28 第28列
     *
     * @param cellId 单元格定位id,行列号,AB7
     * @return
     */
    public static int covertRowIdtoInt(String cellId) {
        StringBuilder sb = new StringBuilder();
        String column = "";
        //从cellId中提取列号
        for(char c:cellId.toCharArray()){
            if (Character.isAlphabetic(c)){
                sb.append(c);
            }else{
                column = sb.toString();
            }
        }
        //列号字符转数字
        int result = 0;
        for (char c : column.toCharArray()) {
            result = result * 26 + (c - 'A') + 1;
        }
        return result;
    }

    /**
     * 根据element属性设置数据类型
     * @param attributes
     */
    public void setNextDataType(Attributes attributes){

        nextDataType = Excel07Parser.CellDataType.NUMBER;
        formatIndex = -1;
        formatString = null;
        String cellType = attributes.getValue("t");
        String cellStyleStr = attributes.getValue("s");
        if ("b".equals(cellType)){
            nextDataType = Excel07Parser.CellDataType.BOOL;
        }else if ("e".equals(cellType)){
            nextDataType = Excel07Parser.CellDataType.ERROR;
        }else if ("inlineStr".equals(cellType)){
            nextDataType = Excel07Parser.CellDataType.INLINESTR;
        }else if ("s".equals(cellType)){
            nextDataType = Excel07Parser.CellDataType.SSTINDEX;
        }else if ("str".equals(cellType)){
            nextDataType = Excel07Parser.CellDataType.FORMULA;
        }
        if (cellStyleStr != null){
            int styleIndex = Integer.parseInt(cellStyleStr);
            XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
            formatIndex = style.getDataFormat();
            formatString = style.getDataFormatString();
            if ("m/d/yy" == formatString){
                nextDataType = Excel07Parser.CellDataType.DATE;
                //full format is "yyyy-MM-dd hh:mm:ss.SSS";
                formatString = "yyyy-MM-dd";
            }
            if (formatString == null){
                nextDataType = Excel07Parser.CellDataType.NULL;
                formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
            }
        }
    }

    /**
     * 根据数据类型获取数据
     * @param value
     * @param thisStr
     * @return
     */
    public String getDataValue(String value, String thisStr)

    {
        switch (nextDataType)
        {
            //这几个的顺序不能随便交换,交换了很可能会导致数据错误
            case BOOL:
                char first = value.charAt(0);
                thisStr = first == '0' ? "FALSE" : "TRUE";
                break;
            case ERROR:
                thisStr = "\"ERROR:" + value.toString() + '"';
                break;
            case FORMULA:
                thisStr = '"' + value.toString() + '"';
                break;
            case INLINESTR:
                XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
                thisStr = rtsi.toString();
                rtsi = null;
                break;
            case SSTINDEX:
                String sstIndex = value.toString();
                thisStr = value.toString();
                break;
            case NUMBER:
                if (formatString != null){
                    thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString).trim();
                }else{
                    thisStr = value;
                }
                thisStr = thisStr.replace("_", "").trim();
                break;
            case DATE:
                try{
                    thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString);
                }catch(NumberFormatException ex){
                    thisStr = value.toString();
                }
                thisStr = thisStr.replace(" ", "");
                break;
            default:
                thisStr = "";
                break;
        }
        return thisStr;
    }

}
如何调用

  

import com.sgis.common.testutils.Excel07Parser;
import org.apache.commons.io.IOUtils;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;


@Test
    public void method_19() throws OpenXML4JException, IOException, SAXException {
        String filePath = "H:\\Project\\test\\测试.xlsx";
        // 读取excel数据
        OPCPackage pkg = OPCPackage.open(filePath);
        XSSFReader r = new XSSFReader(pkg);

        InputStream is = r.getSheet("rId1");
        //debug 查看转换的xml原始文件,方便理解后面解析时的处理,
        byte[] isBytes = IOUtils.toByteArray(is);

        //下面是SST 的索引会用到的
        SharedStringsTable sst = r.getSharedStringsTable();

        XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
        List<List<String>> container = new ArrayList<>();

        StylesTable stylesTable = r.getStylesTable();
        parser.setContentHandler(new Excel07Parser(sst, stylesTable, container));


        InputSource inputSource = new InputSource(new ByteArrayInputStream(isBytes));
        parser.parse(inputSource);

        is.close();

        System.out.println(container);

    }

 

posted @ 2020-11-23 16:35  涂小二  阅读(846)  评论(1编辑  收藏  举报