Common pandas/numpy methods for data processing and analysis

Notes and explanations will be expanded and refined over time.


Reading various data sources with pandas

Reading and writing CSV with pandas

Reading large CSV files (UTF-8)

  • path: file path
  • chunksize: number of rows per chunk (chunked reading speeds up large files)
  • returns: a pandas DataFrame
import pandas as pd

def read_file(path, chunksize=1000):
    chunks = []
    # read the file chunk by chunk, then concatenate into one DataFrame
    for chunk in pd.read_csv(path, chunksize=chunksize,
                             iterator=True, index_col=False):
        chunks.append(chunk)
    df = pd.concat(chunks, axis=0)
    del chunks
    return df
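
A usage sketch (the path and chunk size here are hypothetical):

df = read_file('/path/big_file.csv', chunksize=100000)  # larger chunks generally read faster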

Writing a DataFrame to CSV

df.to_csv('/path/csv_name.csv', mode='a', encoding='utf_8_sig')  # append to an existing file

df.to_csv('/path/csv_name.csv', encoding='utf_8_sig')  # overwrite the file
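
By default to_csv also writes the DataFrame index as the first column; pass index=False to omit it:

df.to_csv('/path/csv_name.csv', index=False, encoding='utf_8_sig')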

Reading and writing MySQL with pandas

Reading MySQL into pandas

import pymysql
import pandas as pd

def search_sq(db, sql):
    connection = pymysql.connect(host="192.168.0.110", user="root",
                                 password="123456", charset='utf8')
    db_cursor = connection.cursor()
    connection.select_db(db)
    try:
        db_cursor.execute(sql)
        ars = db_cursor.fetchall()  # fetch all matching records
    except Exception as e:
        connection.close()
        return str(e)
    connection.close()
    return ars

main_names = usesql.search_sq('fgw', sql)
df = pd.DataFrame(list(main_names), columns=['main_name', 'region_code', 'region_name'])
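
pandas can also run the query itself via pd.read_sql, which returns a DataFrame directly; a minimal sketch reusing the same (author-specific) connection parameters:

import pymysql
import pandas as pd

connection = pymysql.connect(host="192.168.0.110", user="root",
                             password="123456", db='fgw', charset='utf8')
df = pd.read_sql(sql, connection)  # columns are named after the SELECT clause
connection.close()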

Reading and writing HDFS with pandas

Reading Parquet from HDFS

from hdfs3 import HDFileSystem
from fastparquet import ParquetFile

hdfs = HDFileSystem(host='192.0.0.1', port=9000)
sc = hdfs.open  # file-opening function handed to fastparquet
pf = ParquetFile('/nari/71000302/71000302004671/2017/test.parquet', open_with=sc)
df = pf.to_pandas()
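
For the write direction, fastparquet's write accepts the same HDFS open function; a sketch under that assumption (the output path is hypothetical):

from fastparquet import write

# mkdirs lets fastparquet create missing directories on HDFS
write('/nari/output/test.parquet', df,
      open_with=hdfs.open, mkdirs=hdfs.mkdir)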

Reading and writing Elasticsearch with pandas

Bulk-uploading a DataFrame to Elasticsearch

import numpy as np
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def es_post(df, fid, value, index1='data1', type1='float'):  # type1 is currently unused
    es = Elasticsearch('192.0.0.1:9200', timeout=600)
    try:
        mappings = {
            "settings": {"max_result_window": "10000000"},
            "mappings": {
                "properties": {
                    "value": {"type": "long"},
                    "id": {"type": "keyword"},  # keyword fields are not analyzed; text fields are
                    "time": {"type": "date"}
                }
            }
        }
        res = es.indices.create(index=index1, body=mappings)
        print('create')
    except Exception as e:
        print('index already exists')
    atime = np.array(df['time'].astype('int64'))
    avalue = np.array(df['%s' % value].round(2))
    ACTIONS = []
    for i in range(len(df.index.values)):
        action = {
            "_index": "%s" % index1,
            "_id": "%s%s" % (int(atime[i].item()), fid),  # timestamp + device id keeps ids unique
            "_source": {
                "value": avalue[i].item(),
                "time": atime[i].item(),
                "id": "%s" % fid
            }
        }
        ACTIONS.append(action)
    res, _ = bulk(es, ACTIONS, index="%s" % index1, raise_on_error=False)
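
A usage sketch with a two-row DataFrame (the device id is hypothetical; time is in milliseconds):

import pandas as pd

df = pd.DataFrame({'time': [1606272000000, 1606272001000],
                   'val': [1.234, 5.678]})
es_post(df, fid='71000302004671', value='val', index1='data1')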

Reading Elasticsearch data into a DataFrame

from elasticsearch import Elasticsearch
from pandasticsearch import Select
from pandas import DataFrame

def read_es(id1, time1, index='data1', line=43000):
    es = Elasticsearch('http://192.168.0.244:9200', timeout=600)
    result_dict = es.search(
        index="%s" % index,
        body={
            "_source": {"includes": ["%s" % id1, 'time']},
            "query": {
                "bool": {
                    "must": [
                        {"exists": {"field": "%s" % id1}},
                        # time1 is a millisecond timestamp; line is the look-back span in seconds
                        {"range": {"time": {"lt": "%s" % time1,
                                            "gt": "%s" % (time1 - line * 1000)}}}
                    ]
                }
            }
        },
        size=line + 1000)
    df1 = Select.from_dict(result_dict).to_pandas()
    df = DataFrame(df1['time'])
    df['%s' % id1] = df1['%s' % id1].astype(float)
    del df1
    df = df.sort_values(by='time')
    return df
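
A usage sketch (the id and millisecond timestamp are hypothetical):

df = read_es('71000302004671', 1606272000000, index='data1')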

Reading and writing InfluxDB with pandas

Reading InfluxDB into a DataFrame

from datetime import datetime

def read_db(id1, time1, table='ain_din', line=86000):
    # time1 is a millisecond timestamp; line is the look-back span in seconds
    tp = datetime.fromtimestamp(time1 / 1000)
    ts = datetime.fromtimestamp(time1 / 1000 - line)
    result = client.query('''select "%s" from "%s"
WHERE time >= '%s' AND time <= '%s' ''' % (id1, table, ts, tp))
    df = dict(result)['%s' % table]
    col_name = df.columns.tolist()
    col_name.insert(col_name.index('%s' % id1), 'time')
    df = df.reindex(columns=col_name)
    df['time'] = df.index.values
    df = change_time(df, 'time')  # change_time: author's helper, defined elsewhere
    return df

Writing a DataFrame to InfluxDB

def post_db(df, table='7100302'):
    df['sys'] = 'nari'
    # origin shifts the epoch by +8h so timestamps land in local (UTC+8) time
    df['time'] = pd.to_datetime(df['time'], unit='s', origin=pd.Timestamp('1970-01-01 08:00:00'))
    df.set_index("time", inplace=True)
    df = df.sort_index()  # 'time' is now the index, so sort by index
    client.write_points(df, '%s' % table, tag_columns=['sys'], protocol='json', batch_size=100000)
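
Both read_db and post_db assume a module-level client; a minimal sketch creating one with influxdb's DataFrameClient (host, credentials, and database here are hypothetical):

from influxdb import DataFrameClient

client = DataFrameClient(host='192.0.0.1', port=8086,
                         username='root', password='root',
                         database='nari')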

DataFrame data wrangling

Converting values matching a condition to np.nan or another value

df.loc[df["condition_col"] > 1, "target_col"] = np.nan

df.loc[df["condition_col"] > 1, "target_col"] = 0

Handling np.nan values in a DataFrame

  • Drop rows containing np.nan

df = df.dropna(subset=["col_name"])  # check only the given column

df = df.dropna()  # drop rows with NaN in any column

  • Replace np.nan with a given value

df['col_name'] = df['col_name'].fillna(0)

df = df.fillna(0)
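
Besides a constant, NaNs can also be filled from neighbouring rows or a column statistic:

df = df.ffill()  # carry the previous valid value forward
df['col_name'] = df['col_name'].fillna(df['col_name'].mean())  # fill with the column mean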

Converting DataFrame dtypes

df['col_name'] = df['col_name'].astype('int64')

df = df.astype('int64')

Replacing specific values with other values

df = df.replace(['old_value1', 'old_value2'], [new_value1, new_value2])

df['col_name'] = df['col_name'].replace(['old_value1', 'old_value2'], [new_value1, new_value2])

Merging DataFrames

df = pd.merge(df1, df2, on=['key_col']).reset_index(drop=True)
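
merge defaults to an inner join, so rows whose key appears in only one frame are dropped; pass how= to keep them:

df = pd.merge(df1, df2, on=['key_col'], how='left').reset_index(drop=True)  # keep every row of df1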

Renaming DataFrame columns

df.rename(columns={'old_name1': 'new_name1', 'old_name2': 'new_name2'}, inplace=True)

Sorting a DataFrame

df = df.sort_values(by='sort_col', ascending=False)

Dropping duplicate rows

df = df.drop_duplicates(subset=['col_name'], keep='first')

Computation (basics)

DataFrame aggregation

Time-based aggregation

result = df['col_name'].resample('M').sum()  # the index must be a DatetimeIndex; 'M' aggregates by month
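
A self-contained example, building the required DatetimeIndex with pd.date_range:

import pandas as pd

df = pd.DataFrame({'col_name': range(90)},
                  index=pd.date_range('2020-01-01', periods=90, freq='D'))
result = df['col_name'].resample('M').sum()  # one summed value per calendar month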

Group-by aggregation

result = df.groupby(['group_col'])[['value_col']].sum()

Rolling mean

df['col_name'] = df['col_name'].rolling(5).mean()

Statistics

DataFrame statistics

  • Mean
    result = df["col_name"].mean()

  • Maximum
    result = df["col_name"].max()

  • Value counts
    result = df["col_name"].value_counts()

Computation (advanced)
