异步请求、批量处理、持续写入

import grequests
import pandas as pd
import time

# Greedy grouping of semantically similar sentences.
#
# Each round takes the first remaining sentence as the "anchor", asks the
# distance service to compare it against every other remaining sentence (in
# concurrent batches), collects the sentences whose distance is below the
# threshold, appends the group as one line to the output file, and removes the
# whole group from the remaining list. The loop ends when no sentence is left.
all_start = time.time()
sentence_list = []  # Similar sentence pairs (not used by the loop below)

df = pd.read_csv("new_clean.csv")
# De-duplicate before converting to a list. Missing values are dropped because
# a NaN entry cannot be joined into the output line.
all_sentence_list = list(df["句子"].dropna().unique())
serving_url = ""
auth_username = ""
auth_password = ""
batch = 500  # Number of comparison requests sent concurrently per batch
file_name = "same_sentence.txt"
group_id = 0  # Index of the current round, printed for progress tracking
while all_sentence_list:
    current_sentence = all_sentence_list[0]  # Anchor sentence of this round
    other_sentence_len = len(all_sentence_list)  # Remaining list length
    print(group_id, current_sentence, other_sentence_len)
    # Sentences judged similar to the anchor, including the anchor itself.
    current_sentence_same_list = [current_sentence]
    # Index 0 is the anchor, so candidates start at index 1. Stepping by
    # `batch` covers every remaining index exactly once.
    for batch_start in range(1, other_sentence_len, batch):
        batch_end = min(batch_start + batch, other_sentence_len)
        print(batch_start, batch_end)
        candidates = all_sentence_list[batch_start:batch_end]
        # Pass the sentences via `params` so that they are URL-encoded;
        # characters such as "&", "#" or spaces would otherwise corrupt the
        # query string.
        req_list = [
            grequests.get(
                serving_url + "/sentence_distance",
                params={"s1": current_sentence, "s2": candidate},
                auth=(auth_username, auth_password),
                timeout=4,  # Per-request timeout so one slow call cannot block
            )
            for candidate in candidates
        ]
        request_start = time.time()
        # Sends the batch concurrently and returns once all requests finished.
        # Results come back in request order; a failed request yields None.
        res_list = grequests.map(req_list)
        request_end = time.time()
        print("耗时：{}".format(request_end - request_start))
        # Pair each response with the sentence it was sent for instead of
        # parsing the sentence back out of the (percent-encoded) URL.
        for candidate, res in zip(candidates, res_list):
            # Skips failed requests (None) and error status codes (a Response
            # is falsy when its status code is 4xx/5xx).
            if not res:
                continue
            try:
                distance = res.json()["distance"]
                is_similar = distance < 0.6
            except (ValueError, KeyError, TypeError):
                # Body is not JSON, has no "distance" field, or the value is
                # not comparable to a number: ignore this response.
                continue
            if is_similar:
                print(distance, res.url)
                current_sentence_same_list.append(candidate)
    # Remove every sentence of this group from the remaining list.
    matched = set(current_sentence_same_list)
    all_sentence_list = [s for s in all_sentence_list if s not in matched]
    # Append the group as one line right away so that finished rounds are
    # persisted even if a later round fails.
    with open(file_name, "a", encoding="utf-8") as f:
        current_same_sentences = "----".join(current_sentence_same_list)
        f.write(current_same_sentences + "\n")
    group_id += 1

all_end = time.time()

print("总耗时：{}".format(all_end - all_start))
posted @ 2023-03-05 14:46 tiansz 阅读(13) 评论(0) 编辑收藏举报
刷新页面返回顶部