pandas使用

使用笔记

基于某列concat或merge
- df = pd.merge(df_raw, df_ret, on="text")
交集
- df_join = df1.merge(df2, how="inner", left_on="key1", right_on="key2")
差集
- df_diff = df1[~df1["key1"].isin(df2["key2"])]
replace
- df1.key1.str.replace(r'[^\w\s]+', '', regex=True).str.upper()

split数据集

  # Three-way split of dc_p1_sample into train / dev / test.
  # NOTE(review): DataFrame.drop removes rows by index label, so this relies on
  # dc_p1_sample having unique index labels — TODO confirm.
  # train: a random train_frac share of the rows, seeded with RANDOM_STATE.
  dc_p1_train = dc_p1_sample.sample(frac=train_frac, random_state=RANDOM_STATE)
  # dev (provisional): every row that was not drawn for train.
  dc_p1_dev = dc_p1_sample.drop(dc_p1_train.index)
  # test: a random dev_test_frac share of the provisional dev rows, same seed.
  dc_p1_test = dc_p1_dev.sample(frac=dev_test_frac, random_state=RANDOM_STATE)
  # dev (final): what remains once the test rows are removed.
  dc_p1_dev = dc_p1_dev.drop(dc_p1_test.index)

json_normalize展开嵌套列

字典嵌套展开

# Imports are repeated here so this snippet runs on its own.
import pandas as pd
from pandas import json_normalize

# Example record: "address" is a nested dict, "phone_numbers" a list of dicts.
data = {
    "id": 1,
    "name": "John",
    "address": {
        "street": "123 Main St",
        "city": "New York",
        "state": "NY",
        "zip": "10001",
    },
    "phone_numbers": [
        {
            "type": "home",
            "number": "555-1234",
        },
        {
            "type": "work",
            "number": "555-5678",
        },
    ],
}
df = pd.DataFrame([data])

# Expand every dict stored in the "address" column into one column per key.
address_df = json_normalize(df['address'])
# address_df:
#         street      city state    zip
# 0  123 Main St  New York    NY  10001

# Swap the original dict column for the expanded columns (rows align on index).
df = pd.concat([df.drop('address', axis=1), address_df], axis=1)

字典列表展开

import pandas as pd
from pandas import json_normalize

# Example record: "phone_numbers" is a list of dicts that should become rows.
data = {
    "id": 1,
    "name": "John",
    "address": {"street": "123 Main St", "city": "New York", "state": "NY", "zip": "10001"},
    "phone_numbers": [
        {"type": "home", "number": "555-1234"},
        {"type": "work", "number": "555-5678"},
    ],
}
df = pd.DataFrame([data])

# One output row per phone entry; "id" is carried along as the join key and
# every field coming from the list gets the "phone_" prefix.
phone_df = json_normalize(
    df.to_dict(orient='records'),
    record_path=['phone_numbers'],
    meta=['id'],
    record_prefix='phone_',
)
# phone_df:
#   phone_type phone_number id
# 0       home     555-1234  1
# 1       work     555-5678  1

# Replace the list column by joining the expanded rows back on "id".
df = df.drop(columns='phone_numbers').merge(phone_df, on='id')

采样

# Per-label down-sampling: keep at most 200 random rows for each value of "label".
# The groups are iterated directly instead of going through GroupBy.apply, because
# apply on a frame that still contains the grouping column is deprecated since
# pandas 2.2, so the "label" column is not guaranteed to survive there. Iterating
# always yields complete group frames. Groups come out in sorted label order and
# the result gets a fresh 0..n-1 index, as before.
# pd.concat raises ValueError on an empty list, so df needs at least one labelled row.
sampled_df = pd.concat(
    [group.sample(n=min(len(group), 200)) for _, group in df.groupby('label')],
    ignore_index=True,
)

多线程

# Process df in consecutive batches of batch_size rows: run $Function on every
# "asr_text" value of a batch in a thread pool, then store the results in the
# "domain" / "subDomain" columns of the same rows.
# NOTE(review): $Function is a template placeholder for the per-query callable;
# substitute a real function before running.
for sta in tqdm.tqdm(range(0, len(df), batch_size)):
    end = min(sta + batch_size, len(df)) 
    batch = df.iloc[sta:end]
    # NOTE(review): task_list is never read in this snippet.
    task_list = []
    cur_res = None
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Pass the requestId argument via functools.partial
        # NOTE(review): no functools.partial call appears below; wrap the callable
        # yourself if it needs extra arguments.
        queries = batch["asr_text"].to_list()
        # executor.map returns results in the same order as queries.
        cur_res = list(executor.map($Function, queries))
    if cur_res:
        # print(f"{end-sta}, {end}, {sta}, cur_res: {len(cur_res)}, {cur_res}")
        assert len(cur_res) == (end - sta), "Invalid result"
        # NOTE(review): the batch is read by position (iloc) but written by label
        # (loc); presumably df has the default 0..n-1 index — confirm.
        df.loc[sta:end-1, ["domain", "subDomain"]] = cur_res # note: loc includes both sta and end, iloc excludes end, so the loc assignment needs end-1

posted @ 2023-05-17 15:41 春树&暮云阅读(19) 评论(0) 编辑收藏举报

刷新页面返回顶部

春树&暮云

pandas使用

使用笔记

公告