部分常用代码整理
获取 top-K 的检索数据
def i2t(images, sims, npts=None, return_ranks=False): """ Images->Text (Image Annotation) Images: (N, n_region, d) matrix of images Captions: (5N, max_n_word, d) matrix of captions CapLens: (5N) array of caption lengths sims: (N, 5N) matrix of similarity im-cap """ npts = images.shape[0] ranks = np.zeros(npts) top1 = np.zeros(npts) results = np.zeros((5000, 10), dtype='int') for index in range(npts): inds = np.argsort(sims[index])[::-1] # Score rank = 1e20 for i in range(5 * index, 5 * index + 5, 1): tmp = np.where(inds == i)[0][0] if tmp < rank: rank = tmp ranks[index] = rank top1[index] = inds[0]
# ranking前10的文本 results[index] = inds[0:10] # ranks 表示正确的排序位置,0表示top-1 np.savetxt("/mnt/data10t/bakuphome20210617/zhangkun/vis_cosine/i2t_results.csv", results) np.savetxt("/mnt/data10t/bakuphome20210617/zhangkun/vis_cosine/i2t_right_or_wrong.csv", ranks) # Compute metrics r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks) r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks) r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks) medr = np.floor(np.median(ranks)) + 1 meanr = ranks.mean() + 1 if return_ranks: return (r1, r5, r10, medr, meanr), (ranks, top1) else: return (r1, r5, r10, medr, meanr) def t2i(images, sims, npts=None, return_ranks=False): """ Text->Images (Image Search) Images: (N, n_region, d) matrix of images Captions: (5N, max_n_word, d) matrix of captions CapLens: (5N) array of caption lengths sims: (N, 5N) matrix of similarity im-cap """ npts = images.shape[0] ranks = np.zeros(5 * npts) top1 = np.zeros(5 * npts) # --> (5N(caption), N(image)) sims = sims.T results = np.zeros((5000, 10), dtype='int') for index in range(npts): for i in range(5): inds = np.argsort(sims[5 * index + i])[::-1] ranks[5 * index + i] = np.where(inds == index)[0][0] top1[5 * index + i] = inds[0] results[5 * index + i] = inds[0:10] np.savetxt("/mnt/data10t/bakuphome20210617/zhangkun/vis_cosine/t2i_results.csv", results) np.savetxt("/mnt/data10t/bakuphome20210617/zhangkun/vis_cosine/t2i_right_or_wrong.csv", ranks) # Compute metrics r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks) r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks) r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks) medr = np.floor(np.median(ranks)) + 1 meanr = ranks.mean() + 1 if return_ranks: return (r1, r5, r10, medr, meanr), (ranks, top1) else: return (r1, r5, r10, medr, meanr)
获取 attention map, bbox
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image


def region_attention_visualization(img_path, boxes, box_attentions, i):
    """Blend per-region attention weights onto the image as filled boxes.

    Args:
        img_path: path of the source image.
        boxes: iterable of (x1, y1, x2, y2) bounding boxes, one per region.
        box_attentions: attention weight assigned to each region box.
        i: index used to name the output file.

    The result is written to /mnt/data2/zk/vis/<i>.jpg.
    """
    canvas = np.ascontiguousarray(Image.open(img_path, mode='r'))
    for bbox, weight in zip(boxes, box_attentions):
        # White filled rectangle on a black mask; its blend strength is
        # proportional to the region's attention weight.
        overlay = np.zeros(canvas.shape, dtype=np.uint8)
        overlay = cv2.rectangle(
            overlay,
            (int(bbox[0]), int(bbox[1])),
            (int(bbox[2]), int(bbox[3])),
            (255, 255, 255),
            -1,
        )
        canvas = cv2.addWeighted(canvas, 1, overlay, weight, 0)
    out_name = "/mnt/data2/zk/vis/%d.jpg" % i
    cv2.imwrite(out_name, cv2.cvtColor(canvas, cv2.COLOR_RGB2BGR))


def region_visualization(img_path, bboxes):
    """Draw each of the 36 region boxes on its own copy of the image.

    Args:
        img_path: path of the source image.
        bboxes: (36, 4) array of (x1, y1, x2, y2) boxes.

    One labelled image per region is written to /mnt/data2/zk/vis/bbx/.
    """
    print("load image from: ", img_path)
    for idx in range(36):
        # Fresh copy per region so each output shows a single box.
        canvas = np.ascontiguousarray(Image.open(img_path, mode='r'))
        top_left = (int(bboxes[idx, 0]), int(bboxes[idx, 1]))
        bottom_right = (int(bboxes[idx, 2]), int(bboxes[idx, 3]))
        print(top_left, bottom_right)
        cv2.rectangle(canvas, top_left, bottom_right, (0, 0, 255), 2)
        cv2.putText(canvas, '{}'.format(idx), top_left,
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
        out_name = "/mnt/data2/zk/vis/bbx/%d.jpg" % idx
        cv2.imwrite(out_name, cv2.cvtColor(canvas, cv2.COLOR_RGB2BGR))


if __name__ == "__main__":
    # Flickr30K test-split ids, one per line (first whitespace token).
    flickrid_list = []
    with open("/mnt/data10t/bakuphome20210617/lz/data/I-T/Flickr30K/f30k_precomp/test_ids.txt", "r") as ids_file:
        for line in ids_file.readlines():
            flickrid_list.append(line.split()[0])

    # Mapping from Flickr image id to the on-disk image file name.
    df = pd.read_csv("/mnt/data10t/bakuphome20210617/lz/data/I-T/filename2flickrid.csv")
    total_step = len(flickrid_list)

    # Visualize one selected image-text pair.
    i = 53          # index of the pair within the test split
    imgid = '219'   # Flickr image id of the selected pair
    matched_row = df.loc[df['imgid'] == int(imgid)]
    image_file = matched_row.iloc[0, 0]
    img_dir = "/mnt/data10t/bakuphome20210617/lz/data/I-T/flickr30k-images/" + str(image_file)

    bbox_info = np.load("/mnt/data10t/bakuphome20210617/lz/data/I-T/Flickr30K/f30k_precomp/test_ims_bbx.npy")
    bboxes = bbox_info[i]

    conf_path = "/mnt/data10t/bakuphome20210617/lz/neg_2021_9_11/attn_conf/" + str(i) + ".npy"
    conf = np.load(conf_path)
    # Min-max scale the 36 region confidences to [0, 1].
    conf = (conf - np.min(conf)) / (np.max(conf) - np.min(conf))
    # Keep only the 15 most confident regions; zero out the rest.
    keep = conf.argsort()[::-1][0:15]
    for j in range(36):
        if j not in keep:
            conf[j] = 0.
    # Renormalize to sum 1, then amplify for visibility in the blend.
    conf = (conf / np.sum(conf)) * 1.5
    region_attention_visualization(img_dir, bboxes, conf, i)
数据集JSON文件处理:根据数据集划分的ID,重新组织image file name和对应的captions
import numpy as np
import pandas as pd
import ujson as json
from modules.basic_utils import load_json

# COCO-2014 caption annotation files (official splits).
caption_train_2014 = '/mnt/data10t/bakuphome20210617/lz/data/I-T/MS-COCO/annotations/captions_train2014.json'
caption_val_2014 = '/mnt/data10t/bakuphome20210617/lz/data/I-T/MS-COCO/annotations/captions_val2014.json'
caption_train = load_json(caption_train_2014)
caption_val = load_json(caption_val_2014)

# Image ids defining the rearranged (Karpathy-style) splits.
testall_image_id = np.loadtxt('/mnt/data10t/bakuphome20210617/lz/data/I-T/MS-COCO/coco_precomp/testall_ids.txt', dtype=int)
train_image_id = np.loadtxt('/mnt/data10t/bakuphome20210617/lz/data/I-T/MS-COCO/coco_precomp/train_ids.txt', dtype=int)
dev_image_id = np.loadtxt('/mnt/data10t/bakuphome20210617/lz/data/I-T/MS-COCO/coco_precomp/dev_ids.txt', dtype=int)
print(len(testall_image_id))
print(len(train_image_id))
print(len(dev_image_id))


def _index_annotations(ann):
    """Build one-pass lookups: image id -> file name, image id -> captions.

    Replaces the original O(N*M) linear scan per image id with O(N+M)
    preprocessing. Caption order per image follows the annotation order,
    matching the original scan. Image ids are assumed unique per file.
    """
    names = {img['id']: img['file_name'] for img in ann['images']}
    caps = {}
    for a in ann['annotations']:
        caps.setdefault(a['image_id'], []).append(a['caption'])
    return names, caps


train_names, train_caps = _index_annotations(caption_train)
val_names, val_caps = _index_annotations(caption_val)

######################################################################################################################
train_json = {'images': []}
for image_id in train_image_id:
    split = 'train'
    # Look in train2014 first, then fall back to val2014 — the rearranged
    # train split mixes images from both official splits.
    if image_id in train_names:
        file_name = train_names[image_id]
        caption = train_caps.get(image_id, [])
    else:
        file_name = val_names.get(image_id, '')
        caption = val_caps.get(image_id, []) if file_name else []
    if len(caption) != 5:
        # Some COCO images carry more (or fewer) than 5 captions; flag it.
        print('error!')
    data = {'imageid': str(image_id), 'split': split, 'file_name': file_name,
            'sentences': [{'raw': caption[0]}, {'raw': caption[1]}, {'raw': caption[2]},
                          {'raw': caption[3]}, {'raw': caption[4]}]}
    train_json['images'].append(data)

with open('/mnt/data2/zk/train_coco.json', 'w') as out_file:
    out_file.write(json.dumps(train_json))

# NOTE(review): the 'dev' and 'testall' variants follow the same pattern —
# iterate dev_image_id / testall_image_id with a stride of 5 (each image id
# is repeated 5x), look the ids up in the val2014 indexes only, and write
# dev_coco.json / testall_coco.json respectively.
Ensemble代码:输入为对应的similarity matrix
# -------------------------------------------------------------------------------------
# Negative-Aware Attention Framework for Image-Text Matching implementation based on SCAN
# https://github.com/CrossmodalGroup/NAAF
# "Negative-Aware Attention Framework for Image-Text Matching"
# Kun Zhang, Zhendong Mao, Quan Wang, Yongdong Zhang
#
# Writen by Kun Zhang, 2022
# -------------------------------------------------------------------------------------
# from vocab import Vocabulary
# import evaluation
import numpy as np
import os


def i2t(im_len, sims, npts=None, return_ranks=False):
    """
    Images->Text (Image Annotation) evaluation.

    Args:
        im_len: number of images N.
        sims: (N, 5N) similarity matrix between images and captions.
        npts: unused; kept for signature compatibility.
        return_ranks: if True, also return the raw ranks and top-1 indices.

    Returns:
        (r1, r5, r10, medr, meanr) and, if return_ranks, (ranks, top1).
    """
    npts = im_len
    ranks = np.zeros(npts)
    top1 = np.zeros(npts)
    for index in range(npts):
        inds = np.argsort(sims[index])[::-1]
        # Each image has 5 ground-truth captions (5*index .. 5*index+4);
        # keep the best (lowest) rank among them.
        rank = 1e20
        for i in range(5 * index, 5 * index + 5, 1):
            tmp = np.where(inds == i)[0][0]
            if tmp < rank:
                rank = tmp
        ranks[index] = rank
        top1[index] = inds[0]
    # Recall@K metrics (percentage of queries with rank below K).
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, medr, meanr)


def t2i(im_len, sims, npts=None, return_ranks=False):
    """
    Text->Images (Image Search) evaluation.

    Args:
        im_len: number of images N.
        sims: (N, 5N) similarity matrix between images and captions.
        npts: unused; kept for signature compatibility.
        return_ranks: if True, also return the raw ranks and top-1 indices.

    Returns:
        (r1, r5, r10, medr, meanr) and, if return_ranks, (ranks, top1).
    """
    npts = im_len
    ranks = np.zeros(5 * npts)
    top1 = np.zeros(5 * npts)
    # --> (5N(caption), N(image)): one similarity row per caption.
    sims = sims.T
    for index in range(npts):
        for i in range(5):
            inds = np.argsort(sims[5 * index + i])[::-1]
            # Ground-truth image for caption 5*index+i is image `index`.
            ranks[5 * index + i] = np.where(inds == index)[0][0]
            top1[5 * index + i] = inds[0]
    # Recall@K metrics (percentage of queries with rank below K).
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, medr, meanr)


if __name__ == '__main__':
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    isfold5 = True
    if not isfold5:
        # ## Flickr30K
        # Path_of_Model_1 = '/mnt/data2/zk/ESL_bert/checkpoint2/Flickr30K_ESL_MODELS/sim_best_flickr_521.7_learnable.txt'
        # Path_of_Model_2 = '/mnt/data2/zk/ESL_bert/checkpoint2/Flickr30K_ESL_MODELS/sim_best_flickr_522.2.txt'
        ## MS-COCO
        Path_of_Model_1 = '/mnt/data2/zk/ESL_bert/checkpoint2/COCO-LEARNABLE/sim_best_447.0_coco_5k.txt'
        Path_of_Model_2 = '/mnt/data2/zk/ESL_bert/checkpoint2/COCO-NON-LEARNABLE/sim_best_coco_446.9_non_learnable.txt'
        sims1 = np.loadtxt(Path_of_Model_1)
        sims2 = np.loadtxt(Path_of_Model_2)
        # Ensemble by summing similarity matrices (scale does not affect ranks).
        sims = (sims1 + sims2)
        im_len = len(sims)
        print('im length:', im_len)
        r, rt = i2t(im_len, sims, return_ranks=True)
        ri, rti = t2i(im_len, sims, return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # 5-fold (1K) COCO evaluation: average the metrics over the folds.
        results = []
        for i in range(5):
            Path_of_Model_1 = '/mnt/data2/zk/ESL_bert/checkpoint2/COCO-LEARNABLE/'
            Path_of_Model_2 = '/mnt/data2/zk/ESL_bert/checkpoint2/COCO-NON-LEARNABLE/'
            sims1 = np.loadtxt(Path_of_Model_1 + str(i) + 'sim_best.txt')
            sims2 = np.loadtxt(Path_of_Model_2 + str(i) + 'sim_best.txt')
            sim_shard = (sims1 + sims2) / 2
            im_len = len(sim_shard)
            print('im length:', im_len)
            r, rt0 = i2t(im_len, sim_shard, return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(im_len, sim_shard, return_ranks=True)
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)
            if i == 0:
                rt, rti = rt0, rti0
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            # Row layout: [0:5]=i2t metrics, [5:10]=t2i metrics,
            # [10]=ar, [11]=ari, [12]=rsum.
            results += [list(r) + list(ri) + [ar, ari, rsum]]
        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[12]))
        # BUG FIX: the original printed mean_metrics[11] (ari) as the average
        # i2t recall and mean_metrics[12] (rsum) as the average t2i recall.
        # Per the row layout above, ar is index 10 and ari is index 11.
        print("Average i2t Recall: %.1f" % mean_metrics[10])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[11])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[5:10])
打印argparse 参数
# Template: print all argparse arguments after parsing.
# BUG FIX: the original used argparse (and logging mid-script) without
# importing it, so the snippet failed with NameError when pasted as-is.
import argparse
import logging

parser = argparse.ArgumentParser()
# parser.add_argument(...)
# ... keep adding arguments here
args = parser.parse_args()

# 1. Print each argument with print(): left-pad the name to 20 columns.
for arg in vars(args):
    print(format(arg, '<20'), format(str(getattr(args, arg)), '<'))  # str, arg_type

# 2. Print all arguments through logging.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
message = '\n'.join([f'{k:<20}: {v}' for k, v in vars(args).items()])
logger.info(message)
# or directly: logging.info(message)
统计数据集中caption的长度
import torch
import torch.utils.data as data
import random
import os
import numpy as np
import pickle
import json
import nltk

# Caption-length statistics over a pre-tokenised caption file
# (one comma-separated token list per line; the first and last
# tokens are non-word markers, hence the "- 2").
#
# Flickr30K: /mnt/data10t/bakuphome20210617/I-T/Flickr30K/f30k_precomp/train_precaps.txt
# MS-COCO:   /mnt/data10t/bakuphome20210617/data/coco_precomp/train_precaps_stan.txt
captions_length = []
captions_length_bool = []
with open('/mnt/data10t/bakuphome20210617/data/coco_precomp/train_precaps_stan.txt', 'r') as caps_file:
    for line in caps_file:
        n_tokens = len(line.strip().split(',')) - 2
        captions_length.append(n_tokens)
        # Flag captions with at least 12 tokens.
        captions_length_bool.append(1 if n_tokens >= 12 else 0)

print(np.mean(captions_length))
vali_length = np.sum(captions_length_bool)
print(vali_length / len(captions_length_bool))

# Reference numbers:
# Flickr30K: mean 12.406; >=10 -> 0.6735, >=15 -> 0.2702
# MS-COCO:   mean 10.304; >=10 -> 0.5642, >=15 -> 0.0556