savReaderWriter 模块的使用
作用:
python 可以辅助数据分析和数据挖掘,需要读取 SPSS 的 .sav 数据文件,而 savReaderWriter 模块就是为此而设计的。
官网 :http://pythonhosted.org/savReaderWriter/
读取文件
# Basic read: iterate over every row of a .sav file.
# ioUtf8=True makes the reader return decoded (unicode) strings; without it,
# multi-byte characters (e.g. Chinese) come back as escaped hex bytes,
# which is much harder to work with.
with savReaderWriter.SavReader(filepath, ioUtf8=True) as read:
    for i in read:
        print(i)
返回值:
# getsavfileinfo infomation :
# (self.numVars, self.nCases, self.varNames, self.varTypes,self.formats, self.varLabels, self.valueLabels)
读取文件头
with savReaderWriter.SavReader(filepath, ioUtf8=True) as read: ret = read.getSavFileInfo() # return (self.numVars, self.nCases, self.varNames, self.varTypes, # self.formats, self.varLabels, self.valueLabels) # return read.formats, read.varNames, read.varLabels, read.valueLabels return ret[4], ret[2], ret[5], ret[6]
生成spss实例 (注意: valueLabels 的值的 key 必须是浮点型的)
import datetime savFileName = '/opt/someFile.sav' varNames = [u'ID', u'StartTime', u'EndTime', u'VerNo', u'Q1', u'Q2', u'Q4'] varTypes = {u'Q1': 0, u'Q2': 400, u'Q4': 400, u'StartTime': 0, u'VerNo': 0, u'EndTime': 0, u'ID': 20} varLabels = {u'Q1': u'\u5546\u8d85\u914d\u9001\u6536\u8d39\u6807\u51c6\u6b63\u786e\u7684\u662f', u'Q2': u'\u5546\u8d85\u4e0a\u7ebf\u6807\u51c6', u'Q4': u'\u672c\u6b21\u57f9\u8bad\u6536\u83b7\u548c\u610f\u89c1', u'StartTime': u'\u5f00\u59cb\u65f6\u95f4', u'VerNo': u'\u7248\u672c', u'EndTime': u'\u7ed3\u675f\u65f6\u95f4', u'ID': u'\u7528\u6237'} valueLabels = {'Q1': {1.0: u'\u4e13\u9001\u6536\u8d39', 2.0: u'\u5feb\u9001\u6536\u8d39'}, u'Q2': {}, u'Q4': {}, 'StartTime': {}, 'VerNo': {}, 'EndTime': {}, 'ID': {}} formats = {u'Q1': u'F5.0', u'VerNo': u'F5.0', u'EndTime': 'DATETIME40', u'StartTime': 'DATETIME40'} data = [[u'lKWmel1491380676', 13710788676.0, 13710788696.0, 1L, 1, u'\u725b\u820c', u'\u6e56\u516c\u56ed\u80e1\u5a77']] # 时间模块这样是错误的data = [[u'lKWmel1491380676', datetime.datetime(2016, 9, 21, 13, 42, 8), datetime.datetime(2016, 9, 21, 13, 42, 8), 1L, 1, u'\u725b\u820c', u'\u6e56\u516c\u56ed\u80e1\u5a77']] # # with SavWriter(savFileName, varNames, varTypes, varLabels=varLabels, columnWidths={}, ioUtf8=True) as writer: # writer.writerows(data) with SavWriter(savFileName=savFileName, varNames=varNames, varTypes=varTypes, varLabels=varLabels, valueLabels=valueLabels, ioUtf8=True, formats=formats, columnWidths={}) as writer: writer.writerows(data)
错误总结:
1.
针对时间的更改
spss有自己的时间戳,为浮点型,与python的浮点型不一样,有差距,特别要注意
2.
读取文件时候,对文件里面时间改变成字符串类型
# Read a .sav file and convert DATETIME columns to strings.
# ioUtf8=True so that Chinese text is decoded instead of coming back as
# escaped hex bytes.
with savReaderWriter.SavReader(filepath, ioUtf8=True) as read:
    my_time = my_datetime()
    for i in read:
        for j in range(len(valuetypes)):
            # The database does not accept unicode as-is, so values need
            # converting; VARCHAR values are stored to the DB as JSON.
            if valuetypes[j] == "DATETIME":
                # Python 2.7: the value already comes back as a string —
                # just check whether it is unicode and convert if needed:
                # become_time = my_time.become_str(i[j])
                # i[j] = become_time
                # Python 3.5: the value is an SPSS timestamp (a float that
                # differs from Python's), so it must be converted explicitly
                # (see the savReaderWriter source for details):
                i[j] = read.spss2strDate(i[j], '%Y-%m-%d %H:%M:%S', None)
3.
写入的时候对时间的处理
# Write rows pulled from a database query into a .sav file, converting each
# column according to its declared type.
savFileName = '/opt/someFile.sav'
with SavWriter(savFileName=savFileName, varNames=varNames, varTypes=varTypes,
               formats=formats, varLabels=varLabels, valueLabels=valueLabels,
               ioUtf8=True, columnWidths={}) as writer:
    for row_data in query_data:
        sub_li = []
        for i in range(len(my_columns_types)):
            sub_data = row_data[varNames[i]]
            if my_columns_types[i] == "VARCHAR":
                # VARCHAR values were stored in the DB as JSON — decode them.
                sub_li.append(json.loads(sub_data))
            elif my_columns_types[i] == "DATETIME":
                # spssDateTime (from the savReaderWriter source) converts a
                # byte string into an SPSS timestamp.
                sub_li.append(writer.spssDateTime(b'%s' % sub_data, '%Y-%m-%d %H:%M:%S'))
                # NOTE: on Python 3, with aaa being a str, build the bytes
                # explicitly instead:
                # sub_li.append(writer.spssDateTime(bytes(aaa, 'utf-8'), '%Y-%m-%d %H:%M:%S'))
            elif my_columns_types[i] == "DATE":
                sub_li.append(writer.spssDateTime(b'%s' % sub_data, '%Y-%m-%d'))
            else:
                sub_li.append(sub_data)
        data.append(sub_li)
    writer.writerows(data)
4.
json对字典的处理,2.7与3.5不同,如果存入数据库的话,2.7需要pickle, 而3.5需要json
5
错误总结
通常以下错误的原因是头部数据信息和data数据不对称,数据列不对等造成的, 比如可能varNames有10列,而数据只有5列 (columns 不匹配)
Traceback (most recent call last): File "/opt/code/test_code/SpssMysql_and_SyntheticSpss/controllers/download_handler.py", line 92, in <module> varLabels=varLabels, ioUtf8=True) as writer: File "/usr/local/lib/python2.7/dist-packages/savReaderWriter/savWriter.py", line 220, in __init__ self.varNamesTypes = self.varNames, self.varTypes File "/usr/local/lib/python2.7/dist-packages/savReaderWriter/header.py", line 200, in varNamesTypes checkErrsWarns(msg, retcode) File "/usr/local/lib/python2.7/dist-packages/savReaderWriter/error.py", line 120, in checkErrsWarns raise SPSSIOError(msg, retcode) savReaderWriter.error.SPSSIOError: Problem setting variable name 'ID' [SPSS_DUP_VAR]
6.
原因是列的名称不符合标准,字母数字下划线才ok
7. 'utf-8' codec can't decode bytes in position 48-49: unexpected end of data
意思是不能解码字节位置48-49:意料之外的数据
为什么, 因为spss数据出现了乱码, 在某一列,例如: spss进行了截取,这个时候就会出现乱码情况
# Read in byte mode (ioUtf8 must NOT be True here): when SPSS has truncated a
# string column in the middle of a multi-byte character, decoding the whole
# value fails, so we decode manually and drop the last two bytes on failure.
with savReaderWriter.SavReader(filepath) as read:
    dataList = []  # rows accumulated for a multi-row DB insert
    for i in read:
        for j in range(len(valuetypes)):
            # The database does not accept unicode as-is; VARCHAR values are
            # stored to the DB as JSON.
            if valuetypes[j] == "DATETIME":
                if i[j]:
                    i[j] = read.spss2strDate(i[j], '%Y-%m-%d %H:%M:%S', None)
                    # i[j] = read.spss2strDate(str(i[j], encoding='utf-8'), '%Y-%m-%d %H:%M:%S', None)
            elif valuetypes[j] == "DATE":
                if i[j]:
                    i[j] = read.spss2strDate(i[j], '%Y-%m-%d', None)
                    # i[j] = read.spss2strDate(str(i[j], encoding='utf-8'), '%Y-%m-%d', None)
            elif valuetypes[j] == "VARCHAR" or valuetypes[j] == "TEXT":
                try:
                    i[j] = i[j].decode("utf-8")
                except UnicodeDecodeError:
                    # SPSS truncated the value mid-character; strip the
                    # dangling partial character (last two bytes) and retry.
                    i[j] = i[j][:-2].decode('utf-8')
作者:沐禹辰
出处:http://www.cnblogs.com/renfanzi/
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接。
出处:http://www.cnblogs.com/renfanzi/
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接。