# %% [markdown]
# Step1 提取出前1轮的关键词
# %%
import pandas as pd
# Load the previous-round and next-round workbooks; both paths are relative
# to the current working directory.
head, tail = (
    pd.read_excel(workbook)
    for workbook in ('前1轮相同电器.xlsx', '后1轮相同电器.xlsx')
)
# %%
# Third column (position 2) of the previous-round sheet; the Step1 heading
# describes it as the keyword column.
key1 = head.iloc[:, 2]
# %%
# Bare expression so the notebook cell displays the selected column.
key1
# %%
# Strip the appliance name "燃气灶" and the "、" separator from every keyword
# cell and keep whatever text remains.
import re

# Alternation rather than character classes: `[燃气灶]` would delete the
# characters 燃, 气 and 灶 one by one and mangle unrelated words (for example
# "天然气" would become "天然").
# NOTE(review): removing "、" glues multiple keywords in one cell together;
# confirm whether such cells should be split into separate keywords instead.
noise_pattern = re.compile(r'燃气灶|、')
list1 = [
    cleaned
    # Empty spreadsheet cells are read as NaN (a float), which re.sub cannot
    # process, so they are dropped before cleaning.
    for cleaned in (noise_pattern.sub("", str(cell)) for cell in key1.dropna())
    # An empty keyword would be "contained" in every sentence later on.
    if cleaned
]
# Bare expression so the notebook cell displays the cleaned keywords.
list1
# %% [markdown]
# Step2 提取出后1轮的数据(不包含意图和关键词)
# %%
# Keep every column from the fourth one (position 3) through the last; an
# open-ended slice already stops at the final column.
value1 = tail.iloc[:, 3:]
# %%
value1
# %%
# Convert the DataFrame into a 2-D NumPy array.
import numpy as np
data_array = np.array(value1)
# Bare expression so the notebook cell shows the nested-list form; the result
# is not stored anywhere.
data_array.tolist()
# %%
# Flatten the 2-D array row by row and drop missing cells.
# NaN never compares equal to itself, so `np.nan in list2` only finds entries
# that are the very same object as np.nan and can leave other NaN values
# behind. pd.isna recognises every missing value, and filtering in a single
# pass avoids rescanning the list for each removal.
list2 = [cell for row in data_array for cell in row if not pd.isna(cell)]
# %%
# Bare expression so the notebook cell displays the flattened values.
list2
# %% [markdown]
# Step3 若句子中含有该关键词,则提取出该关键词
# %%
# Print every sentence that contains at least one of the cleaned keywords.
keywords = list1
for sentence in list2:
    # Spreadsheet cells may hold numbers or NaN; a substring test against a
    # non-string raises TypeError, so only text cells are searched.
    if isinstance(sentence, str) and any(word in sentence for word in keywords):
        print(sentence)