pandas使用

使用笔记

  1. 基于某列concat或merge
    • df = pd.merge(df_raw, df_ret, on="text")
  2. 交集
    • df_join = df1.merge(df2, how="inner", left_on="key1", right_on="key2")
  3. 差集
    • df_diff = df1[~df1["key1"].isin(df2["key2"])]
  4. replace
    • df1.key1.str.replace(r'[^\w\s]+', '', regex=True).str.upper()
  5. split数据集
      dc_p1_train = dc_p1_sample.sample(frac=train_frac, random_state=RANDOM_STATE)
      dc_p1_dev = dc_p1_sample.drop(dc_p1_train.index)
      dc_p1_test = dc_p1_dev.sample(frac=dev_test_frac, random_state=RANDOM_STATE)
      dc_p1_dev = dc_p1_dev.drop(dc_p1_test.index) 
    
  6. json_normalize展开嵌套列
  • 字典嵌套展开
    data = {
     "id": 1,
     "name": "John",
     "address": {
        "street": "123 Main St",
        "city": "New York",
        "state": "NY",
        "zip": "10001"
     },
     "phone_numbers": [
        {
           "type": "home",
           "number": "555-1234"
        },
        {
           "type": "work",
           "number": "555-5678"
        }
     ]
    }
    df = pd.DataFrame([data])
    address_df = pd.json_normalize(df['address'])
    """
    street      city state    zip
    0  123 Main St  New York    NY  10001
    """
    df = pd.concat([df.drop('address', axis=1), address_df], axis=1)
    
  • 字典列表展开
    import pandas as pd
    from pandas import json_normalize
    
    data = {
       "id": 1,
       "name": "John",
       "address": {
          "street": "123 Main St",
          "city": "New York",
          "state": "NY",
          "zip": "10001"
       },
       "phone_numbers": [
          {
             "type": "home",
             "number": "555-1234"
          },
          {
             "type": "work",
             "number": "555-5678"
          }
       ]
    }
    df = pd.DataFrame([data])
    phone_df = json_normalize(df.to_dict(orient='records'), record_path=['phone_numbers'], meta=['id'], record_prefix='phone_')
    """
    phone_type phone_number id
    0       home     555-1234  1
    1       work     555-5678  1
    """
    df = pd.merge(df.drop('phone_numbers', axis=1), phone_df, on='id')
    
    
  7. 采样
  • sampled_df = df.groupby('label').apply(lambda x: x.sample(n=min(len(x), 200))).reset_index(drop=True)
  8. 多线程
# Walk df in consecutive batches of batch_size rows. For each batch, the
# "asr_text" values are mapped through a worker callable on a thread pool and
# the results are written back into the "domain" / "subDomain" columns.
# `$Function` is a template placeholder for that worker callable.
for sta in tqdm.tqdm(range(0, len(df), batch_size)):
    end = min(sta + batch_size, len(df)) 
    batch = df.iloc[sta:end]
    # NOTE(review): task_list is never used in this snippet.
    task_list = []
    cur_res = None
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Extra arguments (e.g. a requestId) can be bound to the worker via functools.partial.
        queries = batch["asr_text"].to_list()
        # executor.map returns results in the same order as `queries`.
        cur_res = list(executor.map($Function, queries))
    if cur_res:
        # print(f"{end-sta}, {end}, {sta}, cur_res: {len(cur_res)}, {cur_res}")
        assert len(cur_res) == (end - sta), "Invalid result"
        # loc slices include both endpoints while iloc excludes the end, so the
        # loc assignment must stop at end-1.
        # NOTE(review): loc is label-based, so this presumably relies on df having
        # the default 0..n-1 integer index — confirm.
        df.loc[sta:end-1, ["domain", "subDomain"]] = cur_res
posted @ 2023-05-17 15:41  春树&暮云  阅读(19)  评论(0)  编辑  收藏  举报