Java解析excel文件大数据量时,报OOM异常问题。

在解析EXCEL,JAVA 通常使用poi包下的 XSSFWorkbook 对象,但是,遇到海量数据(比如十几万条 或者 JVM堆设置了内存比较小时),就会抛出OOM异常,下面就放出解决方法。

直接上代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
package com.xxx.xxx.xxx.common.util;
 
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
 
/**
 * @author Jimmy Shan
 * @date 2021-04-09
 * @desc 解析大数据量excel文件,避免OOM发生
 */
@Component
public class ExcelParserBigData {
    private static final Logger logger = LoggerFactory.getLogger(ExcelParserBigData.class);
    private ISheetContentHandler contentHandler = new DefaultSheetHandler(); //表格默认处理器
    private List<String[]> datas = new ArrayList<String[]>(); //读取数据
 
    /**
     * @desc 转换表格,默认为转换第一个表格
     */
    public ExcelParserBigData parse(InputStream stream)
            throws InvalidFormatException, IOException, ParseException {
        return parse(stream, 1);
    }
 
    /**
     * @desc 解析方法
     */
    public synchronized ExcelParserBigData parse(InputStream stream, int sheetId)
            throws InvalidFormatException, IOException, ParseException {
        // 每次转换前都清空数据
        datas.clear();
        // 打开表格文件输入流
        OPCPackage pkg = OPCPackage.open(stream);
        try {
            // 创建表阅读器
            XSSFReader reader;
            try {
                reader = new XSSFReader(pkg);
            } catch (OpenXML4JException e) {
                logger.error("读取表格出错");
                throw new ParseException(e.fillInStackTrace());
            }
 
            // 转换指定单元表
            InputStream shellStream = reader.getSheet("rId" + sheetId);
            try {
                InputSource sheetSource = new InputSource(shellStream);
                StylesTable styles = reader.getStylesTable();
                ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(pkg);
                getContentHandler().init(datas);// 设置读取出的数据
                // 获取转换器
                XMLReader parser = getSheetParser(styles, strings);
                parser.parse(sheetSource);
            } catch (SAXException e) {
                logger.error("读取表格出错");
                throw new ParseException(e.fillInStackTrace());
            } finally {
                shellStream.close();
            }
        } finally {
            pkg.close();
        }
 
        return this;
    }
 
    /**
     * @desc 获取表格读取数据,获取数据前,需要先转换数据,此方法不会获取第一行数据,表格读取数据
     */
    public List<String[]> getDatas() {
        return getDatas(true);
    }
 
    /**
     * @desc 获取表格读取数据,获取数据前,需要先转换数据
     */
    public List<String[]> getDatas(boolean dropFirstRow) {
        if (dropFirstRow && datas.size() > 0) {
            datas.remove(0);// 删除表头
        }
 
        return datas;
    }
 
    /**
     * @desc 获取读取表格的转换器
     */
    protected XMLReader getSheetParser(StylesTable styles, ReadOnlySharedStringsTable strings) throws SAXException {
        XMLReader parser = XMLReaderFactory.createXMLReader();
        parser.setContentHandler(new XSSFSheetXMLHandler(styles, strings, getContentHandler(), false));
 
        return parser;
    }
 
    public ISheetContentHandler getContentHandler() {
        return contentHandler;
    }
 
    public void setContentHandler(ISheetContentHandler contentHandler) {
        this.contentHandler = contentHandler;
    }
 
    /**
     * @desc 表格转换错误
     */
    public class ParseException extends Exception {
        private static final long serialVersionUID = -2451526411018517607L;
 
        public ParseException(Throwable t) {
            super("表格转换错误", t);
        }
    }
 
    public interface ISheetContentHandler extends SheetContentsHandler {
        /**
         * @desc 设置转换后的数据集,用于存放转换结果
         */
        void init(List<String[]> datas);
    }
 
    /**
     * @desc 默认表格解析handder
     */
    class DefaultSheetHandler implements ISheetContentHandler {
        /**
         * @desc 读取数据
         */
        private List<String[]> datas;
        private int columsLength;
        private String[] readRow;
        private ArrayList<String> fristRow = new ArrayList<String>();
 
        @Override
        public void init(List<String[]> datas) {
            this.datas = datas;
            //this.columsLength = columsLength;
        }
 
        @Override
        public void startRow(int rowNum) {
            if (rowNum != 0) {
                readRow = new String[columsLength];
            }
        }
 
        @Override
        public void endRow(int rowNum) {
            //将Excel第一行表头的列数当做数组的长度,要保证后续的行的列数不能超过这个长度,这是个约定。
            if (rowNum == 0) {
                columsLength = fristRow.size();
                readRow = fristRow.toArray(new String[fristRow.size()]);
            }else {
                readRow = fristRow.toArray(new String[columsLength]);
            }
            datas.add(readRow.clone());
            readRow = null;
            fristRow.clear();
        }
 
        @Override
        public void cell(String cellReference, String formattedValue, XSSFComment comment) {
            int index = getCellIndex(cellReference);//转换A1,B1,C1等表格位置为真实索引位置
            try {
                fristRow.set(index, formattedValue);
            } catch (IndexOutOfBoundsException e) {
                int size = fristRow.size();
                for (int i = index - size+1;i>0;i--){
                    fristRow.add(null);
                }
                fristRow.set(index,formattedValue);
            }
        }
 
        @Override
        public void headerFooter(String text, boolean isHeader, String tagName) {
        }
 
        /**
         * @desc 转换表格引用为列编号
         */
        public int getCellIndex(String cellReference) {
            String ref = cellReference.replaceAll("\\d+", "");
            int num = 0;
            int result = 0;
            for (int i = 0; i < ref.length(); i++) {
                char ch = cellReference.charAt(ref.length() - i - 1);
                num = (int) (ch - 'A' + 1);
                num *= Math.pow(26, i);
                result += num;
            }
            return result - 1;
        }
    }
}

 

调用方:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
package com.xxx.xxx.xxx.service.impl;
 
import com.xxx.xxx.xxx.common.util.ExcelParserBigData;
import com.xxx.xxx.xxx.service.DemoBaseService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.FileInputStream;
import java.util.List;
 
/**
 * @author Jimmy Shan
 * @date 2021-04-07
 * @desc 示例 服务实现
 */
@Service("demoBaseService")
public class DemoBaseServiceImpl implements DemoBaseService {
    private static final Logger LOGGER = LoggerFactory.getLogger(DemoBaseServiceImpl.class);
    @Autowired
    private ExcelParserBigData excelParserBigData;
 
    /**
     * @desc 解析大数据量excel文件
     */
    @Override
    public void parseBigXlsx(String upPathName) {
        long start = System.currentTimeMillis();
        FileInputStream inf = null;
        try {
            inf = new FileInputStream(upPathName);
            ExcelParserBigData parseBigData = excelParserBigData.parse(inf);
            List<String[]> dataList = parseBigData.getDatas(); //这里的List泛型是String数组,解析后的数据是以 数组形式存放的。
            for (int i = 0; i < dataList.size(); i++) {
                String[] str = dataList.get(i);
                LOGGER.info("第 {} 行", (i +1));
                LOGGER.info("oppoId = {}, oppoName = {}", str[0], str[1]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if(inf != null) {
                    inf.close();
                }
                LOGGER.info("解析总耗时:{} 毫秒", (System.currentTimeMillis() - start));
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}

 

接口定义:

1
2
3
4
5
6
7
8
9
10
11
12
13
package com.xxx.xxx.xxx.service;
 
/**
 * @author Jimmy Shan
 * @date 2021-04-07
 * @desc 示例 服务
 */
public interface DemoBaseService {
    /**
     * @desc 解析大数据量excel文件
     */
    void parseBigXlsx(String upPathName);
}

 

OK,记录到此。

 

posted @   JimmyShan  阅读(2166)  评论(2编辑  收藏  举报
编辑推荐:
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
阅读排行:
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)
点击右上角即可分享
微信分享提示