Spaces:
Runtime error
Runtime error
| """ | |
| ========================================================================================= | |
| Trojan VQA | |
| Written by Matthew Walmer | |
| Tools to examine the VQA dataset for common words and answers | |
| ========================================================================================= | |
| """ | |
| import os | |
| import re | |
| import json | |
| import tqdm | |
| import numpy as np | |
| from openvqa.openvqa.utils.ans_punct import prep_ans | |
# get the k most frequent answers in the train set
# check mode - lets you check how frequently a given word occurs
def most_frequent_answers(k=50, verbose=False, check=None):
    """Return the k most frequent (preprocessed) answers in the VQA train set.

    Answer counts are computed once from the train annotations and cached to
    disk; later calls load the cache instead of re-scanning the dataset.

    k: number of top answers to return; clamped to the number of unique
        answers so an oversized request cannot raise an IndexError
    verbose: print the top-k answers, their counts, and coverage statistics
    check: optional answer string; reports how often it occurs in the train set
    """
    file = 'data/clean/v2_mscoco_train2014_annotations.json'
    cache = 'utils/train_ans_counts.json'
    # load or compute answer counts
    if os.path.isfile(cache):
        with open(cache, 'r') as f:
            all_answers = json.load(f)
    else:
        with open(file, 'r') as f:
            data = json.load(f)
        annotations = data['annotations']
        all_answers = {}
        for anno in tqdm.tqdm(annotations):
            for ans in anno['answers']:
                # Preprocessing from OpenVQA
                a = prep_ans(ans['answer'])
                if a not in all_answers:
                    all_answers[a] = 0
                all_answers[a] += 1
        with open(cache, 'w') as f:
            json.dump(all_answers, f)
    # find top k; clamp k so requests larger than the vocabulary are safe
    k = min(k, len(all_answers))
    answer_list = list(all_answers.keys())
    count_list = np.array([all_answers[key] for key in answer_list])
    tot_answers = np.sum(count_list)
    # negate counts so argsort yields descending frequency order
    idx_srt = np.argsort(-1 * count_list)
    top_k = [answer_list[i] for i in idx_srt[:k]]
    # check mode (helper tool)
    if check is not None:
        a = prep_ans(check)
        occ = all_answers.get(a, 0)
        print('CHECKING for answer: %s'%a)
        print('occurs %i times'%occ)
        print('fraction of all answers: %f'%(float(occ)/tot_answers))
    if verbose:
        print('Top %i Answers'%k)
        print('---')
        coverage = 0
        for i in range(k):
            idx = idx_srt[i]
            print('%s - %s'%(answer_list[idx], count_list[idx]))
            coverage += count_list[idx]
        print('---')
        print('Total Answers: %i'%tot_answers)
        print('Unique Answers: %i'%len(all_answers))
        print('Total Answers for Top Answers: %i'%coverage)
        print('Fraction Covered: %f'%(float(coverage)/tot_answers))
    return top_k
# get the k most frequent question first words in the train set
# check mode - lets you check how frequently a given word occurs
def most_frequent_first_words(k=50, verbose=False, check=None):
    """Return the k most frequent question first-words in the VQA train set.

    First-word counts are computed once from the train questions and cached
    to disk; later calls load the cache instead of re-scanning the dataset.

    k: number of top first-words to return; clamped to the number of unique
        first words so an oversized request cannot raise an IndexError
    verbose: print the top-k first words, their counts, and coverage statistics
    check: optional word; reports how often it occurs as a question first word
    """
    file = 'data/clean/v2_OpenEnded_mscoco_train2014_questions.json'
    cache = 'utils/train_fw_counts.json'
    # load or compute first-word counts
    if os.path.isfile(cache):
        with open(cache, 'r') as f:
            first_words = json.load(f)
    else:
        with open(file, 'r') as f:
            data = json.load(f)
        questions = data['questions']
        first_words = {}
        for ques in tqdm.tqdm(questions):
            # pre-processing from OpenVQA:
            words = re.sub(r"([.,'!?\"()*#:;])", '', ques['question'].lower() ).replace('-', ' ').replace('/', ' ').split()
            # guard: a question that is empty after punctuation stripping
            # would otherwise raise IndexError on words[0]
            if not words:
                continue
            if words[0] not in first_words:
                first_words[words[0]] = 0
            first_words[words[0]] += 1
        with open(cache, 'w') as f:
            json.dump(first_words, f)
    # find top k; clamp k so requests larger than the vocabulary are safe
    k = min(k, len(first_words))
    key_list = list(first_words.keys())
    count_list = np.array([first_words[key] for key in key_list])
    tot_proc = np.sum(count_list)
    # negate counts so argsort yields descending frequency order
    idx_srt = np.argsort(-1 * count_list)
    top_k = [key_list[i] for i in idx_srt[:k]]
    # check mode (helper tool)
    if check is not None:
        # apply the same preprocessing as above and keep only the first
        # token, so the lookup matches how first_words keys were built
        # (previously hyphen/slash input produced embedded spaces that
        # could never match a key)
        tokens = re.sub(r"([.,'!?\"()*#:;])", '', check.lower() ).replace('-', ' ').replace('/', ' ').split()
        w = tokens[0] if tokens else ''
        occ = first_words.get(w, 0)
        print('CHECKING for word: %s'%w)
        print('occurs as first word %i times'%occ)
        print('fraction of all questions: %f'%(float(occ)/tot_proc))
    if verbose:
        print('Top %i First Words'%k)
        print('---')
        coverage = 0
        for i in range(k):
            idx = idx_srt[i]
            print('%s - %s'%(key_list[idx], count_list[idx]))
            coverage += count_list[idx]
        print('---')
        print('Total Questions: %i'%tot_proc)
        print('Unique First Words: %i'%len(first_words))
        print('Total Qs of Top Words: %i'%coverage)
        print('Fraction Covered: %f'%(float(coverage)/tot_proc))
    return top_k