使用笔记
- 基于某列concat或merge
- df = pd.merge(df_raw, df_ret, on="text")
- 交集
- df_join = df1.merge(df2, how="inner", left_on="key1", right_on="key2")
- 差集
- df_diff = df1[~df1["key1"].isin(df2["key2"])]
- replace
- df1.key1.str.replace(r'[^\w\s]+', '', regex=True).str.upper()
- split数据集
dc_p1_train = dc_p1_sample.sample(frac=train_frac, random_state=RANDOM_STATE)
dc_p1_dev = dc_p1_sample.drop(dc_p1_train.index)
dc_p1_test = dc_p1_dev.sample(frac=dev_test_frac, random_state=RANDOM_STATE)
dc_p1_dev = dc_p1_dev.drop(dc_p1_test.index)
- json_normalize展开嵌套列
- 字典嵌套展开
data = {
"id": 1,
"name": "John",
"address": {
"street": "123 Main St",
"city": "New York",
"state": "NY",
"zip": "10001"
},
"phone_numbers": [
{
"type": "home",
"number": "555-1234"
},
{
"type": "work",
"number": "555-5678"
}
]
}
df = pd.DataFrame([data])
address_df = json_normalize(df['address'])
"""
street city state zip
0 123 Main St New York NY 10001
"""
df = pd.concat([df.drop('address', axis=1), address_df], axis=1)
- 字典列表展开
import pandas as pd
from pandas import json_normalize
data = {
"id": 1,
"name": "John",
"address": {
"street": "123 Main St",
"city": "New York",
"state": "NY",
"zip": "10001"
},
"phone_numbers": [
{
"type": "home",
"number": "555-1234"
},
{
"type": "work",
"number": "555-5678"
}
]
}
df = pd.DataFrame([data])
phone_df = json_normalize(df.to_dict(orient='records'), record_path=['phone_numbers'], meta=['id'], record_prefix='phone_')
"""
phone_type phone_number id
0 home 555-1234 1
1 work 555-5678 1
"""
df = pd.merge(df.drop('phone_numbers', axis=1), phone_df, on='id')
- 采样
- sampled_df = df.groupby('label').apply(lambda x: x.sample(n=min(len(x), 200))).reset_index(drop=True)
- 多线程
for sta in tqdm.tqdm(range(0, len(df), batch_size)):
end = min(sta + batch_size, len(df))
batch = df.iloc[sta:end]
task_list = []
cur_res = None
with concurrent.futures.ThreadPoolExecutor() as executor:
# 如需传递额外参数(例如 requestId),可用 functools.partial 包装 $Function 后再传给 executor.map
queries = batch["asr_text"].to_list()
cur_res = list(executor.map($Function, queries))
if cur_res:
# print(f"{end-sta}, {end}, {sta}, cur_res: {len(cur_res)}, {cur_res}")
assert len(cur_res) == (end - sta), "Invalid result"
df.loc[sta:end-1, ["domain", "subDomain"]] = cur_res # 注意:loc按标签切片,两端(sta和end)都包含;iloc按位置切片,不包含end。所以用loc赋值时需要end-1(前提是df使用默认的RangeIndex,标签与位置一致)