Table of Contents
问题背景
当数据框中的元素存在列表是,使用data.to_csv,data.to_excel链接都会将其转化成字符串保存。对于后续导入后需要进行二次转换比较棘手。所以也可以考虑通过json和pickle文件进行保存。具体说明如下:
import pandas as pd
import json
import joblib#之前是from sklearn.internels import joblib
#构建数据集
data=pd.DataFrame([{"a":[1,2,3,4],"b":1},{"a":[1,2,1],"b":3}])
data
|
a |
b |
0 |
[1, 2, 3, 4] |
1 |
1 |
[1, 2, 1] |
3 |
数据的保存与读入
to_csv保存
#首先通过to_csv进行保存
data.to_csv(r'test.csv')
#再导入
data1=pd.read_csv(r'test.csv',index_col=0)
data1
|
a |
b |
0 |
[1, 2, 3, 4] |
1 |
1 |
[1, 2, 1] |
3 |
print(data1.loc[0,'a'],type(data1.loc[0,'a']))#默认转换成字符串
[1, 2, 3, 4] <class 'str'>
data1['a']=data1['a'].apply(lambda x:json.loads(x))
print(data1.loc[0,'a'],type(data1.loc[0,'a']))#目标格式
[1, 2, 3, 4] <class 'list'>
to_excel保存
data.to_excel(r'test.xlsx')
#再导入
data2=pd.read_csv(r'test.csv',index_col=0)
data2
|
a |
b |
0 |
[1, 2, 3, 4] |
1 |
1 |
[1, 2, 1] |
3 |
print(data2.loc[0,'a'],type(data2.loc[0,'a']))#默认转换成字符串
[1, 2, 3, 4] <class 'str'>
data2['a']=data2['a'].apply(lambda x:json.loads(x))
print(data2.loc[0,'a'],type(data2.loc[0,'a']))#目标格式
[1, 2, 3, 4] <class 'list'>
josn
#将其保存成json文件
json.dump(data.to_dict(),open(r'test.json','w+'))
若data文件中特殊格式的字符如Timestamps时,可以考虑先转字符串,再保存json.dump(data.astype('str').to_dict(),open(r'test.json','w+'))
#导入
data3=pd.DataFrame(json.load(open(r'test.json','r+')))
data3
|
a |
b |
0 |
[1, 2, 3, 4] |
1 |
1 |
[1, 2, 1] |
3 |
data3.loc[0,'a']
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-14-b9ae983e7860> in <module>
----> 1 data3.loc[0,'a']
d:\software\python\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1760 except (KeyError, IndexError, AttributeError):
1761 pass
-> 1762 return self._getitem_tuple(key)
1763 else:
1764 # we by definition only have the 0th axis
d:\software\python\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
1270 def _getitem_tuple(self, tup: Tuple):
1271 try:
-> 1272 return self._getitem_lowerdim(tup)
1273 except IndexingError:
1274 pass
d:\software\python\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
1387 for i, key in enumerate(tup):
1388 if is_label_like(key) or isinstance(key, tuple):
-> 1389 section = self._getitem_axis(key, axis=i)
1390
1391 # we have yielded a scalar ?
d:\software\python\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1962
1963 # fall thru to straight lookup
-> 1964 self._validate_key(key, axis)
1965 return self._get_label(key, axis=axis)
1966
d:\software\python\lib\site-packages\pandas\core\indexing.py in _validate_key(self, key, axis)
1829
1830 if not is_list_like_indexer(key):
-> 1831 self._convert_scalar_indexer(key, axis)
1832
1833 def _is_scalar_access(self, key: Tuple) -> bool:
d:\software\python\lib\site-packages\pandas\core\indexing.py in _convert_scalar_indexer(self, key, axis)
739 ax = self.obj._get_axis(min(axis, self.ndim - 1))
740 # a scalar
--> 741 return ax._convert_scalar_indexer(key, kind=self.name)
742
743 def _convert_slice_indexer(self, key: slice, axis: int):
d:\software\python\lib\site-packages\pandas\core\indexes\base.py in _convert_scalar_indexer(self, key, kind)
2886 elif kind in ["loc"] and is_integer(key):
2887 if not self.holds_integer():
-> 2888 self._invalid_indexer("label", key)
2889
2890 return key
d:\software\python\lib\site-packages\pandas\core\indexes\base.py in _invalid_indexer(self, form, key)
3075 """
3076 raise TypeError(
-> 3077 f"cannot do {form} indexing on {type(self)} with these "
3078 f"indexers [{key}] of {type(key)}"
3079 )
TypeError: cannot do label indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [0] of <class 'int'>
#查看索引名称发现,其变成了字符串
data3.loc['0','a']
[1, 2, 3, 4]
print(data3.loc['0','a'],type(data3.loc['0','a']))#元素为目标格式,但数字索引被转换成了字符串
[1, 2, 3, 4] <class 'list'>
#改进,即修改索引名即可
data3.index=[i for i in range(data3.shape[0])]
print(data3.loc[0,'a'],type(data3.loc[0,'a']))#元素为目标格式,但数字索引被转换成了字符串
[1, 2, 3, 4] <class 'list'>
pickle
data.to_pickle(r'test.pkl')
#导入
data4=pd.read_pickle(r'test.pkl')
data4
|
a |
b |
0 |
[1, 2, 3, 4] |
1 |
1 |
[1, 2, 1] |
3 |
print(data4.loc[0,'a'],type(data4.loc[0,'a']))#直接为原始文件
[1, 2, 3, 4] <class 'list'>
文件储存大小对比<后补>