java POI 读取大数据量 excel 处理空单元格问题
import lombok.Data; import org.apache.poi.ss.usermodel.BuiltinFormats; import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.xssf.model.SharedStringsTable; import org.apache.poi.xssf.model.StylesTable; import org.apache.poi.xssf.usermodel.XSSFCellStyle; import org.apache.poi.xssf.usermodel.XSSFRichTextString; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import java.util.ArrayList; import java.util.List; /** * @author: * @descripition: 快速读取excel * @date: created in 15:30 2020/10/13 * @modify: Copyright (c) Supermap All Rights Reserved. */ @Data public class Excel07Parser extends DefaultHandler { //取SST 的索引对应的值 private SharedStringsTable sst; //解析结果保存 private List<List<String>> container; // 开始行 private Integer startRow=0; // 结束行 private Integer endRow=0; // 当前行 private Integer row; // 是否是查询数据 private Boolean is=false; private Excel07Parser.CellDataType nextDataType = Excel07Parser.CellDataType.SSTINDEX; private final DataFormatter formatter = new DataFormatter(); private short formatIndex; private String formatString; private StylesTable stylesTable; //用一个enum表示单元格可能的数据类型 enum CellDataType{ BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, DATE, NULL } public Excel07Parser(SharedStringsTable sst,StylesTable stylesTable, List<List<String>> container) { this.sst = sst; this.container = container; this.stylesTable = stylesTable; } public Excel07Parser(SharedStringsTable sst, List<List<String>> container, Integer startRow, Integer endRow, Boolean is) { this.sst = sst; this.container = container; this.startRow = startRow; this.endRow = endRow; this.is = is; } /** * 存储cell标签下v标签包裹的字符文本内容 * 在v标签开始后,解析器自动调用characters()保存到 lastContents * 【但】当cell标签的属性 s是 t时, 表示取到的lastContents是 SharedStringsTable 的index值 * 需要在v标签结束时根据 index(lastContents)获取一次真正的值 */ private String lastContents; //有效数据矩形区域,A1:Y2 private String dimension; //根据dimension得出每行的数据长度 private int longest; //上个有内容的单元格id,判断空单元格 private String lastCellid; //上一行id, 判断空行 private String lastRowid; // 判断单元格cell的c标签下是否有v,否则可能数据错位 private boolean hasV = false; //行数据保存 private List<String> currentRow; //单元格内容是SST 的索引 private boolean isSSTIndex = false; @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { lastContents = ""; if (qName.equals("dimension")) { dimension = attributes.getValue("ref"); longest = covertRowIdtoInt(dimension.substring(dimension.indexOf(":") + 1)); } //行开始 if (qName.equals("row")) { String rowNum = attributes.getValue("r"); row = Integer.parseInt(rowNum); //判断空行 if (lastRowid != null) { //与上一行相差2, 说明中间有空行 int gap = Integer.parseInt(rowNum) - Integer.parseInt(lastRowid); if (gap > 1) { gap -= 1; while (gap > 0) { container.add(new ArrayList<>()); gap--; } } } lastRowid = attributes.getValue("r"); currentRow = new ArrayList<>(); } if (qName.equals("c")) { // 设置单元格的数据类型 this.setNextDataType(attributes); String rowId = attributes.getValue("r"); //空单元判断,添加空字符到list if (lastCellid != null) { int gap = covertRowIdtoInt(rowId) - covertRowIdtoInt(lastCellid); for (int i = 0; i < gap - 1; i++) { currentRow.add(""); } } else { //第一个单元格可能不是在第一列 if (!"A1".equals(rowId)) { for (int i = 0; i < covertRowIdtoInt(rowId) - 1; i++) { currentRow.add(""); } } } lastCellid = rowId; //判断单元格的值是SST 的索引,不能直接characters方法取值 if (attributes.getValue("t") != null && attributes.getValue("t").equals("s")) { isSSTIndex = true; } else { isSSTIndex = false; } } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { //行结束,存储一行数据 if (qName.equals("row")) { //判断最后一个单元格是否在最后,补齐列数 //【注意】有的单元格只修改单元格格式,而没有内容,会出现c标签下没有v标签,导致currentRow少 if (covertRowIdtoInt(lastCellid) < longest) { int min = Math.min(currentRow.size(), covertRowIdtoInt(lastCellid)); for (int i = 0; i < longest - min; i++) { currentRow.add(""); } } if (is){ if (row==1 || row==2 ||(row > startRow && row < endRow)){ container.add(currentRow); } }else { container.add(currentRow); } lastCellid = null; } //单元格结束,没有v时需要补位 if (qName.equals("c")){ if (!hasV) currentRow.add(""); hasV = false; } //单元格内容标签结束,characters方法会被调用处理内容 if (qName.equals("v")) { hasV = true; //单元格的值是SST 的索引 if (isSSTIndex) { String sstIndex = lastContents.toString(); try { int idx = Integer.parseInt(sstIndex); XSSFRichTextString rtss = new XSSFRichTextString( sst.getEntryAt(idx)); lastContents = rtss.toString(); currentRow.add(lastContents); } catch (NumberFormatException ex) { System.out.println(lastContents); } } else { lastContents = this.getDataValue(lastContents.trim(), ""); currentRow.add(lastContents); } } } /** * 获取element的文本数据 * * @see org.xml.sax.ContentHandler#characters */ @Override public void characters(char[] ch, int start, int length) throws SAXException { lastContents += new String(ch, start, length); } /** * 列号转数字 AB7-->28 第28列 * * @param cellId 单元格定位id,行列号,AB7 * @return */ public static int covertRowIdtoInt(String cellId) { StringBuilder sb = new StringBuilder(); String column = ""; //从cellId中提取列号 for(char c:cellId.toCharArray()){ if (Character.isAlphabetic(c)){ sb.append(c); }else{ column = sb.toString(); } } //列号字符转数字 int result = 0; for (char c : column.toCharArray()) { result = result * 26 + (c - 'A') + 1; } return result; } /** * 根据element属性设置数据类型 * @param attributes */ public void setNextDataType(Attributes attributes){ nextDataType = Excel07Parser.CellDataType.NUMBER; formatIndex = -1; formatString = null; String cellType = attributes.getValue("t"); String cellStyleStr = attributes.getValue("s"); if ("b".equals(cellType)){ nextDataType = Excel07Parser.CellDataType.BOOL; }else if ("e".equals(cellType)){ nextDataType = Excel07Parser.CellDataType.ERROR; }else if ("inlineStr".equals(cellType)){ nextDataType = Excel07Parser.CellDataType.INLINESTR; }else if ("s".equals(cellType)){ nextDataType = Excel07Parser.CellDataType.SSTINDEX; }else if ("str".equals(cellType)){ nextDataType = Excel07Parser.CellDataType.FORMULA; } if (cellStyleStr != null){ int styleIndex = Integer.parseInt(cellStyleStr); XSSFCellStyle style = stylesTable.getStyleAt(styleIndex); formatIndex = style.getDataFormat(); formatString = style.getDataFormatString(); if ("m/d/yy" == formatString){ nextDataType = Excel07Parser.CellDataType.DATE; //full format is "yyyy-MM-dd hh:mm:ss.SSS"; formatString = "yyyy-MM-dd"; } if (formatString == null){ nextDataType = Excel07Parser.CellDataType.NULL; formatString = BuiltinFormats.getBuiltinFormat(formatIndex); } } } /** * 根据数据类型获取数据 * @param value * @param thisStr * @return */ public String getDataValue(String value, String thisStr) { switch (nextDataType) { //这几个的顺序不能随便交换,交换了很可能会导致数据错误 case BOOL: char first = value.charAt(0); thisStr = first == '0' ? "FALSE" : "TRUE"; break; case ERROR: thisStr = "\"ERROR:" + value.toString() + '"'; break; case FORMULA: thisStr = '"' + value.toString() + '"'; break; case INLINESTR: XSSFRichTextString rtsi = new XSSFRichTextString(value.toString()); thisStr = rtsi.toString(); rtsi = null; break; case SSTINDEX: String sstIndex = value.toString(); thisStr = value.toString(); break; case NUMBER: if (formatString != null){ thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString).trim(); }else{ thisStr = value; } thisStr = thisStr.replace("_", "").trim(); break; case DATE: try{ thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString); }catch(NumberFormatException ex){ thisStr = value.toString(); } thisStr = thisStr.replace(" ", ""); break; default: thisStr = ""; break; } return thisStr; } }
如何调用
import com.sgis.common.testutils.Excel07Parser; import org.apache.commons.io.IOUtils; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.model.SharedStringsTable; import org.apache.poi.xssf.model.StylesTable; import org.junit.Test; import org.junit.runner.RunWith; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.junit4.SpringRunner; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; @Test public void method_19() throws OpenXML4JException, IOException, SAXException { String filePath = "H:\\Project\\test\\测试.xlsx"; // 读取excel数据 OPCPackage pkg = OPCPackage.open(filePath); XSSFReader r = new XSSFReader(pkg); InputStream is = r.getSheet("rId1"); //debug 查看转换的xml原始文件,方便理解后面解析时的处理, byte[] isBytes = IOUtils.toByteArray(is); //下面是SST 的索引会用到的 SharedStringsTable sst = r.getSharedStringsTable(); XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser"); List<List<String>> container = new ArrayList<>(); StylesTable stylesTable = r.getStylesTable(); parser.setContentHandler(new Excel07Parser(sst, stylesTable, container)); InputSource inputSource = new InputSource(new ByteArrayInputStream(isBytes)); parser.parse(inputSource); is.close(); System.out.println(container); }