pandas使用

使用笔记

  1. 基于某列concat或merge
    • df = pd.merge(df_raw, df_ret, on="text")
  2. 交集
    • df_join = df1.merge(df2, how="inner", left_on="key1", right_on="key2")
  3. 差集
    • df_diff = df1[~df1["key1"].isin(df2["key2"])]
  4. replace
    • df1.key1.str.replace(r'[^\w\s]+', '', regex=True).str.upper()
  5. split数据集
      dc_p1_train = dc_p1_sample.sample(frac=train_frac, random_state=RANDOM_STATE)
      dc_p1_dev = dc_p1_sample.drop(dc_p1_train.index)
      dc_p1_test = dc_p1_dev.sample(frac=dev_test_frac, random_state=RANDOM_STATE)
      dc_p1_dev = dc_p1_dev.drop(dc_p1_test.index) 
    
  6. json_normalize展开嵌套列
  • 字典嵌套展开
    data = {
     "id": 1,
     "name": "John",
     "address": {
        "street": "123 Main St",
        "city": "New York",
        "state": "NY",
        "zip": "10001"
     },
     "phone_numbers": [
        {
           "type": "home",
           "number": "555-1234"
        },
        {
           "type": "work",
           "number": "555-5678"
        }
     ]
    }
    df = pd.DataFrame([data])
    address_df = pd.json_normalize(df['address'])
    """
            street      city state    zip
    0  123 Main St  New York    NY  10001
    """
    df = pd.concat([df.drop('address', axis=1), address_df], axis=1)
    
  • 字典列表展开
    import pandas as pd
    from pandas import json_normalize
    
    data = {
       "id": 1,
       "name": "John",
       "address": {
          "street": "123 Main St",
          "city": "New York",
          "state": "NY",
          "zip": "10001"
       },
       "phone_numbers": [
          {
             "type": "home",
             "number": "555-1234"
          },
          {
             "type": "work",
             "number": "555-5678"
          }
       ]
    }
    df = pd.DataFrame([data])
    phone_df = json_normalize(df.to_dict(orient='records'), record_path=['phone_numbers'], meta=['id'], record_prefix='phone_')
    """
      phone_type phone_number id
    0       home     555-1234  1
    1       work     555-5678  1
    """
    df = pd.merge(df.drop('phone_numbers', axis=1), phone_df, on='id')
    
    
posted @ 2023-05-17 15:41  春树&暮云  阅读(11)  评论(0)  编辑  收藏  举报