pandas,numpy数据处理分析常用方法
pandas,numpy数据处理分析常用方法
有空时将持续补充并完善相关说明
pandas读取各类数据库
pandas读存csv
csv大文件读取(utf-8)
- path:文件路径
- chunksize:文件分区大小(加速读取)
- 返回:pandas dataframe
def read_file(path, chunksize=1000):
    """Read a (possibly large) CSV file in chunks and return one DataFrame.

    Parameters
    ----------
    path : str
        Path of the CSV file (UTF-8).
    chunksize : int, default 1000
        Rows per chunk; chunked reading bounds peak memory on big files.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated row-wise (original chunk indexes kept).
    """
    # NOTE: the original snippet was missing the closing parenthesis of the
    # pd.read_csv(...) call; restored here.
    chunks = []
    for chunk in pd.read_csv(path,
                             chunksize=chunksize,
                             iterator=True,
                             index_col=False):
        chunks.append(chunk)
    df = pd.concat(chunks, axis=0)
    del chunks  # release the chunk list before handing back the big frame
    return df
pandas存储为csv
# utf_8_sig writes a BOM so Excel opens the UTF-8 file correctly.
df.to_csv('/path/csv_name.csv',mode='a',encoding='utf_8_sig') # append to the file
df.to_csv('/path/csv_name.csv',encoding='utf_8_sig') # overwrite the file
pandas读存mysql
pandas读取mysql
def search_sq(db, sql, host="192.168.0.110", user="root", password="123456", charset='utf8'):
    """Run *sql* against MySQL database *db* and return all result rows.

    Parameters
    ----------
    db : str
        Database (schema) name to select before executing.
    sql : str
        Query to execute.
    host, user, password, charset : str
        Connection settings; defaults preserve the original hard-coded values.

    Returns
    -------
    tuple of rows on success, or ``str(error)`` on failure.
        NOTE(review): callers must type-check the return value — the
        error-as-string behavior is kept for backward compatibility.
    """
    connection = pymysql.connect(host=host, user=user,
                                 password=password, charset=charset)
    try:
        connection.select_db(db)
        # Cursor as context manager so it is closed even on error
        # (the original never closed it).
        with connection.cursor() as db_cursor:
            db_cursor.execute(sql)
            return db_cursor.fetchall()  # fetch every remaining record
    except Exception as e:
        # Preserve original behavior: swallow the error, hand back its text.
        return str(e)
    finally:
        connection.close()  # always released, on success and on failure
# Example: run the query, then wrap the row tuples in a DataFrame.
# NOTE(review): `usesql` is presumably the module holding search_sq — confirm.
main_names = usesql.search_sq('fgw',sql)
df = pd.DataFrame(list(main_names),columns=['main_name','region_code','region_name'])
pandas读存hdfs
pandas读取hdfs parquet
# Read a Parquet file from HDFS into a DataFrame (hdfs3 + fastparquet).
hdfs = HDFileSystem(host='192.0.0.1', port=9000)
sc = hdfs.open  # file-opener callable handed to fastparquet
pf = ParquetFile('/nari/71000302/71000302004671/2017/test.parquet', open_with=sc)
df = pf.to_pandas()
pandas读存es
批量上传dataframe数据到es
def es_post(df, fid, value, index1='data1', type1='float'):
    """Bulk-upload one value column of *df* (plus its 'time' column) to Elasticsearch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'time' column castable to int64 (epoch millis,
        judging by the int cast — TODO confirm) and the column named *value*.
    fid : str
        Series/point id; stored in the 'id' field and embedded in each _id.
    value : str
        Name of the DataFrame column holding the measurements.
    index1 : str
        Target ES index; created with a fixed mapping when missing.
    type1 : str
        Unused; kept for backward compatibility of the signature.
    """
    es = Elasticsearch('192.0.0.1:9200', timeout=600)
    try:
        mappings = {
            "settings": {"max_result_window": "10000000"},
            "mappings": {
                "properties": {
                    # NOTE(review): values are uploaded as floats rounded to 2
                    # decimals below, yet mapped as "long" (ES will truncate);
                    # confirm this is intended.
                    "value": {
                        "type": "long",
                    },
                    # BUG FIX: the original said "keword", which is not a valid
                    # ES field type and made the index creation request fail.
                    "id": {
                        "type": "keyword",  # keyword is not analyzed; text is
                    },
                    "time": {
                        "type": "date",
                    }
                }
            }
        }
        res = es.indices.create(index=index1, body=mappings)
        print('create')
    except Exception:
        # Best-effort: index already exists (or creation failed) — continue
        # to the bulk insert anyway, as the original did.
        print('already exists')
    atime = np.array(df['time'].astype('int64'))
    avalue = np.array(df['%s' % value].round(2))
    ACTIONS = []
    for i in range(len(df.index.values)):
        action = {
            "_index": "%s" % index1,
            # _id = "<epoch-ms><fid>" makes re-uploads idempotent per (time, id)
            "_id": "%s%s" % (int(atime[i].item()), fid),
            "_source": {
                "value": avalue[i].item(),
                "time": atime[i].item(),
                "id": "%s" % fid
            }
        }
        ACTIONS.append(action)
    # raise_on_error=False: partial failures are tolerated, matching original
    res, _ = bulk(es, ACTIONS, index="%s" % index1, raise_on_error=False)
读取es数据为dataframe
def read_es(id1,time1,index='data1',line=43000):
    """Fetch one series from Elasticsearch into a two-column DataFrame.

    Parameters
    ----------
    id1 : str
        Field name to fetch; also used in the exists/range filters.
    time1 : int
        End of the time window; presumably epoch milliseconds, since the
        lower bound is time1 - line*1000 — TODO confirm.
    index : str
        ES index to query.
    line : int
        Window length (seconds, given the *1000 scaling above).

    Returns
    -------
    pandas.DataFrame with columns 'time' and *id1* (floats), sorted by 'time'.
    """
    es = Elasticsearch('http://192.168.0.244:9200',timeout=600)
    result_dict = es.search(index="%s"%index,
        body={
            "_source": {
                "include": [
                    "%s"%id1,'time'
                ]
            },
            "query": {
                "bool": {
                    "must": [
                        # only docs that actually carry this field...
                        {"exists": {"field": "%s"%id1}},
                        {
                            # ...and fall inside the (time1 - line*1000, time1) window
                            "range": {
                                "time": {
                                    "lt": "%s"%time1,"gt": "%s"%(time1-line*1000)
                                }
                            }
                        }
                    ]
                }
            }
        },size=line+1000)  # over-fetch slightly so the whole window fits in one page
    # pandasticsearch Select flattens the raw hits dict into a DataFrame
    df1 = Select.from_dict(result_dict).to_pandas()
    df = DataFrame(df1['time'])
    df['%s'%id1] = df1['%s'%id1].astype(float)
    del df1  # drop the intermediate frame before returning
    df = df.sort_values(by='time')
    return df
pandas读存influxdb
读取influxdb为dataframe
def read_db(id1,time1,table='ain_din',line=86000):
    """Read one field from an InfluxDB measurement into a DataFrame.

    Parameters
    ----------
    id1 : str
        Field (column) name to select.
    time1 : int
        End of the window, epoch milliseconds (divided by 1000 below).
    table : str
        InfluxDB measurement name.
    line : int
        Window length in seconds.

    Returns
    -------
    pandas.DataFrame with a 'time' column inserted just before *id1*.

    NOTE(review): relies on a module-level `client` (InfluxDB client) and a
    `change_time` helper defined elsewhere in the project — confirm both.
    """
    tp = datetime.fromtimestamp(time1/1000)
    ts = datetime.fromtimestamp(time1/1000-line)
    # Identifiers and timestamps are interpolated into InfluxQL by hand;
    # safe only while id1/table come from trusted code, not user input.
    result = client.query(''' select "%s" from "%s"
    WHERE
    time >= '%s'
    AND
    time <= '%s'
    '''%(id1,table,ts,tp))
    df = dict(result)['%s'%table]
    # Copy the timestamp index into a 'time' column positioned just
    # before the value column.
    col_name = df.columns.tolist()
    col_name.insert(col_name.index('%s'%id1),'time')
    df = df.reindex(columns=col_name)
    df['time'] = df.index.values
    df = change_time(df,'time')  # timestamp conversion — helper defined elsewhere
    return df
dataframe写入influxdb
def post_db(df, table='7100302'):
    """Write a DataFrame to InfluxDB measurement *table*.

    Expects a 'time' column of epoch seconds; adds a constant 'sys' tag
    column. Relies on a module-level `client` (DataFrame-capable InfluxDB
    client) defined elsewhere — confirm availability.

    NOTE: mutates the caller's *df* (adds 'sys', converts 'time',
    promotes it to the index), matching the original behavior.
    """
    df['sys'] = 'nari'
    # origin shifts the epoch by +8h — presumably converting UTC seconds to
    # local (UTC+8) wall-clock time; TODO confirm vs. tz-aware timestamps.
    df['time'] = pd.to_datetime(df['time'], unit='s',
                                origin=pd.Timestamp('1970-01-01 08:00:00'))
    df.set_index("time", inplace=True)
    # BUG FIX: the original called sort_values(by='time') AFTER set_index,
    # when 'time' is no longer a column — that only works on pandas >= 0.23
    # (index-level-name sorting). sort_index() is the unambiguous equivalent.
    df = df.sort_index()
    client.write_points(df, '%s' % table, tag_columns=['sys'],
                        protocol='json', batch_size=100000)
dataframe数据处理
dataframe转化指定条件值为np.nan或其它值
df.loc[df["判断列列名"] > 1,"转换列列名"] = np.nan
df.loc[df["判断列列名"] > 1,"转换列列名"] = 0
dataframe中的np.nan值处理
- 删除值为np.nan的行
df = df.dropna(subset=["指定列名"]) #指定判断列
df = df.dropna() #dataframe #全局删除
- 替换np.nan为指定值
df['指定列名'] = df['指定列名'].fillna(0)
df = df.fillna(0)
dataframe数值类型转化
df['指定列名'] = df['指定列名'].astype('int64')
df = df.astype('int64')
dataframe替换指定值为其它值
df = df.replace(['原始值1','原始值2'],[替换值1,替换值2])
df['所选列名'] = df['所选列名'].replace(['原始值1','原始值2'],[替换值1,替换值2])
dataframe合并
df = pd.merge(df1,df2,on=['基准列名']).reset_index(drop=True)
dataframe列名重命名
df.rename(columns={'原始列名1':'修改列名1','原始列名2':'修改列名2'}, inplace = True)
dataframe排序
df = df.sort_values(by='排序列名',ascending=False)
dataframe去重
df = df.drop_duplicates(subset=['指定列名'], keep='first')
数据运算(基础)
dataframe聚合运算
时间聚合
result = df['指定列'].resample('M').sum() #dataframe索引需为时序数据,M表示按月聚合
指定聚合运算
result = df.groupby(['聚合列名'])[['运算列名']].sum()
求移动均值
df['指定列名'] = df['指定列名'].rolling(5).mean()
数据统计
dataframe统计类
-
均值
result = df["指定列名"].mean()
-
最大值
result = df["指定列名"].max()
-
出现次数
result = df["指定列名"].value_counts()