Matthew
initial commit
0392181
"""
=========================================================================================
Trojan VQA
Written by Matthew Walmer
Tools to examine the VQA dataset for common words and answers
=========================================================================================
"""
import os
import re
import json
import tqdm
import numpy as np
from openvqa.openvqa.utils.ans_punct import prep_ans
# get the k most frequent answers in the train set
# check mode - lets you check how frequently a given answer occurs
def most_frequent_answers(k=50, verbose=False, check=None):
    """Return the k most frequent (preprocessed) answers in the VQA train set.

    Answer counts are loaded from the JSON cache 'utils/train_ans_counts.json'
    when that file exists. Otherwise they are computed from
    'data/clean/v2_mscoco_train2014_annotations.json', normalizing every
    answer string with OpenVQA's ``prep_ans``, and then written to the cache.

    Args:
        k: number of top answers to return. If fewer than k unique answers
            exist, all of them are returned instead of raising IndexError.
        verbose: if True, print the top answers with their counts plus
            coverage statistics.
        check: optional answer string. If given, it is normalized with
            ``prep_ans`` and its count and fraction of all answers are printed.

    Returns:
        List of answer strings ordered from most to least frequent.
    """
    anno_file = 'data/clean/v2_mscoco_train2014_annotations.json'
    cache = 'utils/train_ans_counts.json'
    # load or compute answer counts
    if os.path.isfile(cache):
        with open(cache, 'r') as f:
            all_answers = json.load(f)
    else:
        with open(anno_file, 'r') as f:
            data = json.load(f)
        all_answers = {}
        for anno in tqdm.tqdm(data['annotations']):
            for ans in anno['answers']:
                # Preprocessing from OpenVQA
                a = prep_ans(ans['answer'])
                all_answers[a] = all_answers.get(a, 0) + 1
        with open(cache, 'w') as f:
            json.dump(all_answers, f)
    # find top k
    answer_list = list(all_answers)
    # explicit integer dtype so an empty table still yields an integer total
    count_list = np.array([all_answers[a] for a in answer_list], dtype=np.int64)
    tot_answers = int(np.sum(count_list))
    # negate the counts so the ascending argsort ranks the largest first
    idx_srt = np.argsort(-1 * count_list)
    # clamp k so asking for more answers than exist does not index past the end
    n_top = max(0, min(k, len(answer_list)))
    top_idx = idx_srt[:n_top]
    top_k = [answer_list[idx] for idx in top_idx]
    # check mode (helper tool)
    if check is not None:
        a = prep_ans(check)
        occ = all_answers.get(a, 0)
        # guard against an empty count table (total of zero)
        frac = float(occ) / tot_answers if tot_answers else 0.0
        print('CHECKING for answer: %s'%a)
        print('occurs %i times'%occ)
        print('fraction of all answers: %f'%frac)
    if verbose:
        print('Top %i Answers'%n_top)
        print('---')
        coverage = 0
        for idx in top_idx:
            print('%s - %s'%(answer_list[idx], count_list[idx]))
            coverage += int(count_list[idx])
        print('---')
        print('Total Answers: %i'%tot_answers)
        print('Unique Answers: %i'%len(all_answers))
        print('Total Answers for Top Answers: %i'%coverage)
        covered = float(coverage) / tot_answers if tot_answers else 0.0
        print('Fraction Covered: %f'%covered)
    return top_k
# get the k most frequent question first words in the train set
# check mode - lets you check how frequently a given word occurs as the first word
def most_frequent_first_words(k=50, verbose=False, check=None):
    """Return the k most frequent first words of questions in the VQA train set.

    First-word counts are loaded from the JSON cache
    'utils/train_fw_counts.json' when that file exists. Otherwise they are
    computed from 'data/clean/v2_OpenEnded_mscoco_train2014_questions.json'
    and then written to the cache.

    Args:
        k: number of top first words to return. If fewer than k unique first
            words exist, all of them are returned instead of raising
            IndexError.
        verbose: if True, print the top first words with their counts plus
            coverage statistics.
        check: optional word. If given, it is tokenized exactly like the
            questions, and the count and fraction of its first token are
            printed.

    Returns:
        List of first words ordered from most to least frequent.
    """
    ques_file = 'data/clean/v2_OpenEnded_mscoco_train2014_questions.json'
    cache = 'utils/train_fw_counts.json'

    def tokenize(text):
        # pre-processing from OpenVQA: lowercase, drop punctuation, treat
        # '-' and '/' as separators, then split on whitespace
        return re.sub(r"([.,'!?\"()*#:;])", '', text.lower() ).replace('-', ' ').replace('/', ' ').split()

    # load or compute first word counts
    if os.path.isfile(cache):
        with open(cache, 'r') as f:
            first_words = json.load(f)
    else:
        with open(ques_file, 'r') as f:
            data = json.load(f)
        first_words = {}
        for ques in tqdm.tqdm(data['questions']):
            words = tokenize(ques['question'])
            if not words:
                # nothing left after stripping punctuation: no first word to count
                continue
            first_words[words[0]] = first_words.get(words[0], 0) + 1
        with open(cache, 'w') as f:
            json.dump(first_words, f)
    # find top k
    key_list = list(first_words)
    # explicit integer dtype so an empty table still yields an integer total
    count_list = np.array([first_words[w] for w in key_list], dtype=np.int64)
    tot_proc = int(np.sum(count_list))
    # negate the counts so the ascending argsort ranks the largest first
    idx_srt = np.argsort(-1 * count_list)
    # clamp k so asking for more words than exist does not index past the end
    n_top = max(0, min(k, len(key_list)))
    top_idx = idx_srt[:n_top]
    top_k = [key_list[idx] for idx in top_idx]
    # check mode (helper tool)
    if check is not None:
        # keys are single tokens, so compare against the first token of the
        # normalized check string (stray whitespace would otherwise never match)
        tokens = tokenize(check)
        w = tokens[0] if tokens else ''
        occ = first_words.get(w, 0)
        # guard against an empty count table (total of zero)
        frac = float(occ) / tot_proc if tot_proc else 0.0
        print('CHECKING for word: %s'%w)
        print('occurs as first word %i times'%occ)
        # the denominator is the number of questions, not answers
        print('fraction of all questions: %f'%frac)
    if verbose:
        print('Top %i First Words'%n_top)
        print('---')
        coverage = 0
        for idx in top_idx:
            print('%s - %s'%(key_list[idx], count_list[idx]))
            coverage += int(count_list[idx])
        print('---')
        print('Total Questions: %i'%tot_proc)
        print('Unique First Words: %i'%len(first_words))
        print('Total Qs of Top Words: %i'%coverage)
        covered = float(coverage) / tot_proc if tot_proc else 0.0
        print('Fraction Covered: %f'%covered)
    return top_k