pandas文件保存与读入<补充>

Table of Contents

问题背景

当数据框中的元素存在列表是,使用data.to_csv,data.to_excel链接都会将其转化成字符串保存。对于后续导入后需要进行二次转换比较棘手。所以也可以考虑通过json和pickle文件进行保存。具体说明如下:

import pandas as pd
import json
import joblib#之前是from sklearn.internels import joblib
#构建数据集
data=pd.DataFrame([{"a":[1,2,3,4],"b":1},{"a":[1,2,1],"b":3}])
data
a b
0 [1, 2, 3, 4] 1
1 [1, 2, 1] 3

数据的保存与读入

to_csv保存
#首先通过to_csv进行保存
data.to_csv(r'test.csv')
#再导入
data1=pd.read_csv(r'test.csv',index_col=0)
data1
a b
0 [1, 2, 3, 4] 1
1 [1, 2, 1] 3
print(data1.loc[0,'a'],type(data1.loc[0,'a']))#默认转换成字符串
[1, 2, 3, 4] <class 'str'>
data1['a']=data1['a'].apply(lambda x:json.loads(x))
print(data1.loc[0,'a'],type(data1.loc[0,'a']))#目标格式
[1, 2, 3, 4] <class 'list'>
to_excel保存
data.to_excel(r'test.xlsx')
#再导入
data2=pd.read_csv(r'test.csv',index_col=0)
data2
a b
0 [1, 2, 3, 4] 1
1 [1, 2, 1] 3
print(data2.loc[0,'a'],type(data2.loc[0,'a']))#默认转换成字符串
[1, 2, 3, 4] <class 'str'>
data2['a']=data2['a'].apply(lambda x:json.loads(x))
print(data2.loc[0,'a'],type(data2.loc[0,'a']))#目标格式
[1, 2, 3, 4] <class 'list'>
josn
#将其保存成json文件
json.dump(data.to_dict(),open(r'test.json','w+'))

若data文件中特殊格式的字符如Timestamps时,可以考虑先转字符串,再保存json.dump(data.astype('str').to_dict(),open(r'test.json','w+'))

#导入
data3=pd.DataFrame(json.load(open(r'test.json','r+')))
data3
a b
0 [1, 2, 3, 4] 1
1 [1, 2, 1] 3
data3.loc[0,'a']
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-14-b9ae983e7860> in <module>
----> 1 data3.loc[0,'a']


d:\software\python\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
   1760                 except (KeyError, IndexError, AttributeError):
   1761                     pass
-> 1762             return self._getitem_tuple(key)
   1763         else:
   1764             # we by definition only have the 0th axis


d:\software\python\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
   1270     def _getitem_tuple(self, tup: Tuple):
   1271         try:
-> 1272             return self._getitem_lowerdim(tup)
   1273         except IndexingError:
   1274             pass


d:\software\python\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
   1387         for i, key in enumerate(tup):
   1388             if is_label_like(key) or isinstance(key, tuple):
-> 1389                 section = self._getitem_axis(key, axis=i)
   1390 
   1391                 # we have yielded a scalar ?


d:\software\python\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
   1962 
   1963         # fall thru to straight lookup
-> 1964         self._validate_key(key, axis)
   1965         return self._get_label(key, axis=axis)
   1966 


d:\software\python\lib\site-packages\pandas\core\indexing.py in _validate_key(self, key, axis)
   1829 
   1830         if not is_list_like_indexer(key):
-> 1831             self._convert_scalar_indexer(key, axis)
   1832 
   1833     def _is_scalar_access(self, key: Tuple) -> bool:


d:\software\python\lib\site-packages\pandas\core\indexing.py in _convert_scalar_indexer(self, key, axis)
    739         ax = self.obj._get_axis(min(axis, self.ndim - 1))
    740         # a scalar
--> 741         return ax._convert_scalar_indexer(key, kind=self.name)
    742 
    743     def _convert_slice_indexer(self, key: slice, axis: int):


d:\software\python\lib\site-packages\pandas\core\indexes\base.py in _convert_scalar_indexer(self, key, kind)
   2886             elif kind in ["loc"] and is_integer(key):
   2887                 if not self.holds_integer():
-> 2888                     self._invalid_indexer("label", key)
   2889 
   2890         return key


d:\software\python\lib\site-packages\pandas\core\indexes\base.py in _invalid_indexer(self, form, key)
   3075         """
   3076         raise TypeError(
-> 3077             f"cannot do {form} indexing on {type(self)} with these "
   3078             f"indexers [{key}] of {type(key)}"
   3079         )


TypeError: cannot do label indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [0] of <class 'int'>
#查看索引名称发现,其变成了字符串
data3.loc['0','a']
[1, 2, 3, 4]
print(data3.loc['0','a'],type(data3.loc['0','a']))#元素为目标格式,但数字索引被转换成了字符串
[1, 2, 3, 4] <class 'list'>
#改进,即修改索引名即可
data3.index=[i for i in range(data3.shape[0])]
print(data3.loc[0,'a'],type(data3.loc[0,'a']))#元素为目标格式,但数字索引被转换成了字符串
[1, 2, 3, 4] <class 'list'>
pickle
data.to_pickle(r'test.pkl')
#导入
data4=pd.read_pickle(r'test.pkl')
data4
a b
0 [1, 2, 3, 4] 1
1 [1, 2, 1] 3
print(data4.loc[0,'a'],type(data4.loc[0,'a']))#直接为原始文件
[1, 2, 3, 4] <class 'list'>

文件储存大小对比<后补>

posted @ 2020-08-31 10:39  LgRun  阅读(443)  评论(0编辑  收藏  举报