A Collection of Frequently Used Code Snippets

Getting the top-K retrieval results
import numpy as np


def i2t(images, sims, npts=None, return_ranks=False):
    """
    Images->Text (Image Annotation)
    Images: (N, n_region, d) matrix of images
    Captions: (5N, max_n_word, d) matrix of captions
    CapLens: (5N) array of caption lengths
    sims: (N, 5N) matrix of similarity im-cap
    """
    npts = images.shape[0]
    ranks = np.zeros(npts)
    top1 = np.zeros(npts)
    results = np.zeros((5000, 10), dtype='int')  # one row per query; sized for the 5k-caption test split
    for index in range(npts):
        inds = np.argsort(sims[index])[::-1]
        # Score: best rank among the 5 ground-truth captions
        rank = 1e20
        for i in range(5 * index, 5 * index + 5, 1):
            tmp = np.where(inds == i)[0][0]
            if tmp < rank:
                rank = tmp
        ranks[index] = rank
        top1[index] = inds[0]
        # top-10 ranked captions for this image
        results[index] = inds[0:10]

    # ranks holds the rank of the correct match; 0 means top-1
    np.savetxt("/mnt/data10t/bakuphome20210617/zhangkun/vis_cosine/i2t_results.csv", results)
    np.savetxt("/mnt/data10t/bakuphome20210617/zhangkun/vis_cosine/i2t_right_or_wrong.csv", ranks)

    # Compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, medr, meanr)


def t2i(images, sims, npts=None, return_ranks=False):
    """
    Text->Images (Image Search)
    Images: (N, n_region, d) matrix of images
    Captions: (5N, max_n_word, d) matrix of captions
    CapLens: (5N) array of caption lengths
    sims: (N, 5N) matrix of similarity im-cap
    """
    npts = images.shape[0]
    ranks = np.zeros(5 * npts)
    top1 = np.zeros(5 * npts)

    # --> (5N(caption), N(image))
    sims = sims.T

    results = np.zeros((5000, 10), dtype='int')
    for index in range(npts):
        for i in range(5):
            inds = np.argsort(sims[5 * index + i])[::-1]
            ranks[5 * index + i] = np.where(inds == index)[0][0]
            top1[5 * index + i] = inds[0]
            # top-10 ranked images for this caption
            results[5 * index + i] = inds[0:10]

    np.savetxt("/mnt/data10t/bakuphome20210617/zhangkun/vis_cosine/t2i_results.csv", results)
    np.savetxt("/mnt/data10t/bakuphome20210617/zhangkun/vis_cosine/t2i_right_or_wrong.csv", ranks)

    # Compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, medr, meanr)
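
A minimal usage sketch (assumed setup: a precomputed (N, 5N) similarity matrix with the 5 ground-truth captions of image k at columns 5k..5k+4; the hard-coded np.savetxt paths inside the functions must exist, or be removed, for this to run):

# Hypothetical inputs: only images.shape[0] is actually read by i2t/t2i.
images = np.empty((1000, 1, 1))
sims = np.random.rand(1000, 5000)  # stand-in for model-produced similarities
(r1, r5, r10, medr, meanr), (ranks, top1) = i2t(images, sims, return_ranks=True)
print("i2t  R@1 %.1f  R@5 %.1f  R@10 %.1f" % (r1, r5, r10))
(r1, r5, r10, medr, meanr), (ranks, top1) = t2i(images, sims, return_ranks=True)
print("t2i  R@1 %.1f  R@5 %.1f  R@10 %.1f" % (r1, r5, r10))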
Getting attention maps and bounding boxes


import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image


# Inputs: image path (img_path), bounding boxes, processed attention values
# (36 values, one attention weight per region), and the index i used to name the output.
def region_attention_visualization(img_path, boxes, box_attentions, i):
    # print("load image from: ", img_path)
    # load the image
    img = Image.open(img_path, mode='r')
    # draw each bounding box weighted by its attention
    img = np.ascontiguousarray(img)
    for box, attention in zip(boxes, box_attentions):
        mask = np.zeros(img.shape, dtype=np.uint8)
        mask = cv2.rectangle(mask, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])),
                             (255, 255, 255), -1)  # filled white rectangle over the region
        img = cv2.addWeighted(img, 1, mask, attention, 0)  # blend with the attention weight
    name = "/mnt/data2/zk/vis/%d.jpg" % i
    cv2.imwrite(name, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))


# Inputs: image path and bounding boxes
def region_visualization(img_path, bboxes):
    print("load image from: ", img_path)
    for i in range(36):
        # load the image
        img = Image.open(img_path, mode='r')
        # draw the i-th bounding box with its index
        img = np.ascontiguousarray(img)
        p1 = (int(bboxes[i, 0]), int(bboxes[i, 1]))
        p2 = (int(bboxes[i, 2]), int(bboxes[i, 3]))
        print(p1, p2)
        cv2.rectangle(img, p1, p2, (0, 0, 255), 2)
        font = cv2.FONT_HERSHEY_SIMPLEX  # font for the region index label
        cv2.putText(img, '{}'.format(i), p1, font, 0.5, (255, 0, 0), 1)
        name = "/mnt/data2/zk/vis/bbx/%d.jpg" % i
        cv2.imwrite(name, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))


if __name__ == "__main__":
    flickrid_list = []
    f = open("/mnt/data10t/bakuphome20210617/lz/data/I-T/Flickr30K/f30k_precomp/test_ids.txt", "r")
    lines = f.readlines()
    for line in lines:
        flickrid_list.append(line.split()[0])
    df = pd.read_csv("/mnt/data10t/bakuphome20210617/lz/data/I-T/filename2flickrid.csv")
    total_step = len(flickrid_list)

    # ------------------------------------------------------------------
    # # print all image-text pairs
    # for i, imgid in tqdm(enumerate(flickrid_list), total=total_step, ncols=80):
    #     a = df.loc[df['imgid'] == int(imgid)]
    #     b = a.iloc[0, 0]
    #     img_dir = "/mnt/data10t/bakuphome20210617/lz/data/I-T/flickr30k-images/" + str(b)
    #     bbox_info = np.load("/mnt/data10t/bakuphome20210617/lz/data/I-T/Flickr30K/f30k_precomp/test_ims_bbx.npy")
    #     bboxes = bbox_info[i]
    #     conf_path = "/mnt/data10t/bakuphome20210617/lz/Dim_mask_5/attn_conf/finally/" + str(i) + ".npy"
    #     conf = np.load(conf_path)
    #     conf = (conf - np.min(conf)) / (np.max(conf) - np.min(conf))  # min-max scale
    #     idxs = conf.argsort()[::-1][0:15]  # keep the top-15 highest-confidence regions
    #     for j in range(36):
    #         if j not in idxs:
    #             conf[j] = 0.
    #     conf_sum = np.sum(conf)
    #     conf = (conf / conf_sum) * 5
    #     region_attention_visualization(img_dir, bboxes, conf, i)
    # ------------------------------------------------------------------

    # print selected image-text pairs
    # index of the caption (i.e., which image-text pair)
    i = 53
    imgid = '219'
    a = df.loc[df['imgid'] == int(imgid)]
    b = a.iloc[0, 0]
    img_dir = "/mnt/data10t/bakuphome20210617/lz/data/I-T/flickr30k-images/" + str(b)
    bbox_info = np.load("/mnt/data10t/bakuphome20210617/lz/data/I-T/Flickr30K/f30k_precomp/test_ims_bbx.npy")
    bboxes = bbox_info[i]
    # conf_path = "/mnt/data10t/bakuphome20210617/lz/Dim_mask_5/attn_conf/finally/" + str(i) + ".npy"
    conf_path = "/mnt/data10t/bakuphome20210617/lz/neg_2021_9_11/attn_conf/" + str(i) + ".npy"
    conf = np.load(conf_path)
    conf = (conf - np.min(conf)) / (np.max(conf) - np.min(conf))  # min-max scale
    idxs = conf.argsort()[::-1][0:15]  # keep the top-15 highest-confidence regions
    for j in range(36):
        if j not in idxs:
            conf[j] = 0.
    conf_sum = np.sum(conf)
    conf = (conf / conf_sum) * 1.5
    region_attention_visualization(img_dir, bboxes, conf, i)

    # ------------------------------------------------------------------
    # # visualize all 36 region boxes for a chosen image-text pair
    # i = 53
    # imgid = '219'
    # a = df.loc[df['imgid'] == int(imgid)]
    # b = a.iloc[0, 0]
    # img_dir = "/mnt/data10t/bakuphome20210617/lz/data/I-T/flickr30k-images/" + str(b)
    # bbox_info = np.load("/mnt/data10t/bakuphome20210617/lz/data/I-T/Flickr30K/f30k_precomp/test_ims_bbx.npy")
    # bboxes = bbox_info[i]
    # region_visualization(img_dir, bboxes)
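
The highlighting above works by drawing each region as a filled white rectangle on a black canvas and alpha-blending it onto the image with the attention value as the blend weight. A self-contained sketch of just that blending step, on dummy data:

import cv2
import numpy as np

img = np.zeros((100, 100, 3), dtype=np.uint8)                  # dummy image
mask = np.zeros_like(img)
cv2.rectangle(mask, (10, 10), (60, 60), (255, 255, 255), -1)   # filled white box
out = cv2.addWeighted(img, 1.0, mask, 0.4, 0)                  # region brightened by weight 0.4
cv2.imwrite("overlay_demo.jpg", out)                           # output name is a placeholder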

Dataset JSON processing: reorganize image file names and their captions according to the dataset-split IDs

import numpy as np
import pandas as pd
import ujson as json
from modules.basic_utils import load_json


caption_train_2014 = '/mnt/data10t/bakuphome20210617/lz/data/I-T/MS-COCO/annotations/captions_train2014.json'
caption_val_2014 = '/mnt/data10t/bakuphome20210617/lz/data/I-T/MS-COCO/annotations/captions_val2014.json'
caption_train = load_json(caption_train_2014)
caption_val = load_json(caption_val_2014)

testall_image_id = np.loadtxt('/mnt/data10t/bakuphome20210617/lz/data/I-T/MS-COCO/coco_precomp/testall_ids.txt', dtype=int)
train_image_id = np.loadtxt('/mnt/data10t/bakuphome20210617/lz/data/I-T/MS-COCO/coco_precomp/train_ids.txt', dtype=int)
dev_image_id = np.loadtxt('/mnt/data10t/bakuphome20210617/lz/data/I-T/MS-COCO/coco_precomp/dev_ids.txt', dtype=int)

print(len(testall_image_id))
print(len(train_image_id))
print(len(dev_image_id))

######################################################################################################################
train_json = {'images': []}

for i in range(len(train_image_id)):
    image_id = train_image_id[i]
    split = 'train'
    file_name = ''
    caption = []
    # Look the image up in the train-2014 annotations first.
    for j in range(len(caption_train['images'])):
        if image_id == caption_train['images'][j]['id']:
            file_name = caption_train['images'][j]['file_name']

            for k in range(len(caption_train['annotations'])):
                if image_id == caption_train['annotations'][k]['image_id']:
                    caption.append(caption_train['annotations'][k]['caption'])
            break

    # Fall back to the val-2014 annotations (under the Karpathy split,
    # some train-split IDs come from val2014).
    if file_name == '':
        for j in range(len(caption_val['images'])):
            if image_id == caption_val['images'][j]['id']:
                file_name = caption_val['images'][j]['file_name']

                for k in range(len(caption_val['annotations'])):
                    if image_id == caption_val['annotations'][k]['image_id']:
                        caption.append(caption_val['annotations'][k]['caption'])
                break

    if len(caption) != 5:
        print('error: image %s has %d captions' % (image_id, len(caption)))
    data = {'imageid': str(image_id), 'split': split, 'file_name': file_name,
            'sentences': [{'raw': caption[0]}, {'raw': caption[1]}, {'raw': caption[2]},
                          {'raw': caption[3]}, {'raw': caption[4]}]}

    train_json['images'].append(data)

train_json = json.dumps(train_json)
f = open('/mnt/data2/zk/train_coco.json', 'w')
f.write(train_json)
f.close()
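
The nested scans above walk the full annotation list once per image, which is quadratic in dataset size. A hedged alternative sketch that indexes the annotations once and then does dictionary lookups (the helper names id2file/id2caps are illustrative, not from the original script):

# Build lookup tables once, then rebuild the split in a single linear pass.
id2file = {}
id2caps = {}
for coco in (caption_train, caption_val):
    for im in coco['images']:
        id2file[im['id']] = im['file_name']
    for ann in coco['annotations']:
        id2caps.setdefault(ann['image_id'], []).append(ann['caption'])

train_json = {'images': []}
for image_id in train_image_id:
    caps = id2caps.get(int(image_id), [])[:5]  # keep at most 5 captions per image
    if len(caps) != 5:
        print('error: image %s has %d captions' % (image_id, len(caps)))
        continue
    train_json['images'].append({
        'imageid': str(image_id),
        'split': 'train',
        'file_name': id2file.get(int(image_id), ''),
        'sentences': [{'raw': c} for c in caps],
    })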


######################################################################################################################
# dev_json = {'images':[]}

# for i in range(len(dev_image_id)//5):
#     image_id = dev_image_id[i*5]
#     split = 'dev'
#     file_name = ''
#     caption = []


#     for j in range(len(caption_val['images'])):
#         if image_id == caption_val['images'][j]['id']:
#             file_name = caption_val['images'][j]['file_name']

#             for k in range(len(caption_val['annotations'])):
#                 if image_id == caption_val['annotations'][k]['image_id']:
#                     caption.append(caption_val['annotations'][k]['caption'])
#             break

#     if len(caption) >= 5:
#         data = {'imageid':str(image_id), 'split':split, 'file_name':file_name, 
#         'sentences':[{'raw':caption[0]}, {'raw':caption[1]}, {'raw':caption[2]}, {'raw':caption[3]}, {'raw':caption[4]}]}
#         dev_json['images'].append(data)
#     else:
#         print("error!")

# dev_json = json.dumps(dev_json)
# f = open('/mnt/data2/zk/dev_coco.json', 'w')
# f.write(dev_json)
# f.close()




######################################################################################################################
# testall_json = {'images':[]}

# for i in range(len(testall_image_id)//5):
#     image_id = testall_image_id[i*5]
#     split = 'testall'
#     file_name = ''
#     caption = []


#     for j in range(len(caption_val['images'])):
#         if image_id == caption_val['images'][j]['id']:
#             file_name = caption_val['images'][j]['file_name']

#             for k in range(len(caption_val['annotations'])):
#                 if image_id == caption_val['annotations'][k]['image_id']:
#                     caption.append(caption_val['annotations'][k]['caption'])
#             break

#     if len(caption) >= 5:
#         data = {'imageid':str(image_id), 'split':split, 'file_name':file_name, 
#         'sentences':[{'raw':caption[0]}, {'raw':caption[1]}, {'raw':caption[2]}, {'raw':caption[3]}, {'raw':caption[4]}]}
#         testall_json['images'].append(data)
#     else:
#         print("error!")

# testall_json = json.dumps(testall_json)
# f = open('/mnt/data2/zk/testall_coco.json', 'w')
# f.write(testall_json)
# f.close()

# # for i in range(len(testall_json['images'])):
# #     testall_json['images'][i]['split'] = 'testall'

Ensemble code: the inputs are the corresponding similarity matrices

# -------------------------------------------------------------------------------------
# Negative-Aware Attention Framework for Image-Text Matching, implementation based on SCAN
# https:.
# "Negative-Aware Attention Framework for Image-Text Matching"
# Kun Zhang, Zhendong Mao, Quan Wang, Yongdong Zhang
#
# Written by Kun Zhang, 2022
# -------------------------------------------------------------------------------------
# from vocab import Vocabulary
# import evaluation
import numpy as np
import os


def i2t(im_len, sims, npts=None, return_ranks=False):
    """
    Images->Text (Image Annotation)
    Images: (N, n_region, d) matrix of images
    Captions: (5N, max_n_word, d) matrix of captions
    CapLens: (5N) array of caption lengths
    sims: (N, 5N) matrix of similarity im-cap
    """
    npts = im_len
    ranks = np.zeros(npts)
    top1 = np.zeros(npts)
    for index in range(npts):
        inds = np.argsort(sims[index])[::-1]
        # Score
        rank = 1e20
        for i in range(5 * index, 5 * index + 5, 1):
            tmp = np.where(inds == i)[0][0]
            if tmp < rank:
                rank = tmp
        ranks[index] = rank
        top1[index] = inds[0]

    # Compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, medr, meanr)


def t2i(im_len, sims, npts=None, return_ranks=False):
    """
    Text->Images (Image Search)
    Images: (N, n_region, d) matrix of images
    Captions: (5N, max_n_word, d) matrix of captions
    CapLens: (5N) array of caption lengths
    sims: (N, 5N) matrix of similarity im-cap
    """
    npts = im_len
    ranks = np.zeros(5 * npts)
    top1 = np.zeros(5 * npts)

    # --> (5N(caption), N(image))
    sims = sims.T

    for index in range(npts):
        for i in range(5):
            inds = np.argsort(sims[5 * index + i])[::-1]
            ranks[5 * index + i] = np.where(inds == index)[0][0]
            top1[5 * index + i] = inds[0]

    # Compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, medr, meanr)



if __name__ == '__main__':
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    isfold5 = True

    if not isfold5:

        # ## Flickr30K
        # Path_of_Model_1 = '/mnt/data2/zk/ESL_bert/checkpoint2/Flickr30K_ESL_MODELS/sim_best_flickr_521.7_learnable.txt'
        # Path_of_Model_2 = '/mnt/data2/zk/ESL_bert/checkpoint2/Flickr30K_ESL_MODELS/sim_best_flickr_522.2.txt'

        ## MS-COCO
        Path_of_Model_1 = '/mnt/data2/zk/ESL_bert/checkpoint2/COCO-LEARNABLE/sim_best_447.0_coco_5k.txt'
        Path_of_Model_2 = '/mnt/data2/zk/ESL_bert/checkpoint2/COCO-NON-LEARNABLE/sim_best_coco_446.9_non_learnable.txt'

        sims1 = np.loadtxt(Path_of_Model_1)
        sims2 = np.loadtxt(Path_of_Model_2)

        sims = (sims1 + sims2)  # summing is rank-equivalent to averaging the two models' scores
        im_len = len(sims)
        print('im length:', im_len)
        r, rt = i2t(im_len, sims, return_ranks=True)
        ri, rti = t2i(im_len, sims, return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        results = []
        for i in range(5):

            Path_of_Model_1 = '/mnt/data2/zk/ESL_bert/checkpoint2/COCO-LEARNABLE/'
            Path_of_Model_2 = '/mnt/data2/zk/ESL_bert/checkpoint2/COCO-NON-LEARNABLE/'

            sims1 = np.loadtxt(Path_of_Model_1 + str(i) + 'sim_best.txt')
            sims2 = np.loadtxt(Path_of_Model_2 + str(i) + 'sim_best.txt')

            sim_shard = (sims1 + sims2) / 2
            im_len = len(sim_shard)
            print('im length:', im_len)
            r, rt0 = i2t(im_len, sim_shard, return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(im_len, sim_shard, return_ranks=True)
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)

            if i == 0:
                rt, rti = rt0, rti0
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % ( mean_metrics[12]))
        print("Average i2t Recall: %.1f" % mean_metrics[11])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[12])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[5:10])
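
Note the ensemble simply sums or averages the two similarity matrices, which assumes both models score on comparable scales. If they do not, normalizing each matrix before combining is a common alternative; a sketch of that option (not part of the original code):

def normalize_sims(s):
    # z-score a NumPy similarity matrix so each model contributes comparably;
    # this is an assumed alternative, not the original ensembling rule.
    return (s - s.mean()) / (s.std() + 1e-8)

# sims = normalize_sims(sims1) + normalize_sims(sims2)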

Printing argparse parameters

import argparse

parser = argparse.ArgumentParser()
# parser.add_argument(...)
# ... add further arguments here
args = parser.parse_args()

# 1. Print with print()
for arg in vars(args):
    print(format(arg, '<20'), format(str(getattr(args, arg)), '<'))  # name, value

# 2. Print with logging
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
message = '\n'.join([f'{k:<20}: {v}' for k, v in vars(args).items()])
logger.info(message)
# or log directly via the root logger:
logging.info(message)
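
Besides printing, dumping the parsed arguments to disk makes a run easier to reproduce later; a small sketch (the output filename is a placeholder):

import json

# Persist the full configuration next to the checkpoints.
with open('args.json', 'w') as f:
    json.dump(vars(args), f, indent=2, default=str)  # default=str handles non-JSON-serializable values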

Counting caption lengths in a dataset

import numpy as np

captions_length = []
captions_length_bool = []
## Flickr30K: /mnt/data10t/bakuphome20210617/I-T/Flickr30K/f30k_precomp/train_precaps.txt
## MS-COCO:   /mnt/data10t/bakuphome20210617/data/coco_precomp/train_precaps_stan.txt
with open('/mnt/data10t/bakuphome20210617/data/coco_precomp/train_precaps_stan.txt', 'r') as f:
    for line in f:
        length = len(line.strip().split(',')) - 2
        captions_length.append(length)
        if length >= 12:
            captions_length_bool.append(1)
        else:
            captions_length_bool.append(0)

print(np.mean(captions_length))               # mean caption length
valid_length = np.sum(captions_length_bool)
print(valid_length / len(captions_length_bool))  # fraction of captions at or above the threshold
## Recorded with thresholds 10 and 15 (the snippet above currently uses 12):
## Flickr30K: mean 12.40635172413793;  >=10 -> 0.6734965517241379,  >=15 -> 0.27022068965517243
## MS-COCO:   mean 10.303602355080459; >=10 -> 0.5642359670571204,  >=15 -> 0.05558625438046731
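
To see the whole length distribution rather than one threshold, a quick histogram helps; a minimal sketch assuming matplotlib is available (the bin range is arbitrary):

import matplotlib.pyplot as plt

plt.hist(captions_length, bins=range(0, 60, 2))  # 2-token-wide bins
plt.xlabel('caption length (tokens)')
plt.ylabel('count')
plt.savefig('caption_length_hist.png')           # output name is a placeholder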

 
