Added scripts for calculating scores
This commit is contained in:
parent
fc720e24cc
commit
ae04923971
4 changed files with 314 additions and 0 deletions
210
evaluation/scores/SyncNetInstance_calc_scores.py
Normal file
210
evaluation/scores/SyncNetInstance_calc_scores.py
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
#!/usr/bin/python
|
||||
#-*- coding: utf-8 -*-
|
||||
# Video 25 FPS, Audio 16000HZ
|
||||
|
||||
import torch
|
||||
import numpy
|
||||
import time, pdb, argparse, subprocess, os, math, glob
|
||||
import cv2
|
||||
import python_speech_features
|
||||
|
||||
from scipy import signal
|
||||
from scipy.io import wavfile
|
||||
from SyncNetModel import *
|
||||
from shutil import rmtree
|
||||
|
||||
|
||||
# ==================== Get OFFSET ====================
|
||||
|
||||
def calc_pdist(feat1, feat2, vshift=10):
|
||||
|
||||
win_size = vshift*2+1
|
||||
|
||||
feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift))
|
||||
|
||||
dists = []
|
||||
|
||||
for i in range(0,len(feat1)):
|
||||
|
||||
dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:]))
|
||||
|
||||
return dists
|
||||
|
||||
# ==================== MAIN DEF ====================
|
||||
|
||||
class SyncNetInstance(torch.nn.Module):
|
||||
|
||||
def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024):
|
||||
super(SyncNetInstance, self).__init__();
|
||||
|
||||
self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda();
|
||||
|
||||
def evaluate(self, opt, videofile):
|
||||
|
||||
self.__S__.eval();
|
||||
|
||||
# ========== ==========
|
||||
# Convert files
|
||||
# ========== ==========
|
||||
|
||||
if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
|
||||
rmtree(os.path.join(opt.tmp_dir,opt.reference))
|
||||
|
||||
os.makedirs(os.path.join(opt.tmp_dir,opt.reference))
|
||||
|
||||
command = ("ffmpeg -loglevel error -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg')))
|
||||
output = subprocess.call(command, shell=True, stdout=None)
|
||||
|
||||
command = ("ffmpeg -loglevel error -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav')))
|
||||
output = subprocess.call(command, shell=True, stdout=None)
|
||||
|
||||
# ========== ==========
|
||||
# Load video
|
||||
# ========== ==========
|
||||
|
||||
images = []
|
||||
|
||||
flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg'))
|
||||
flist.sort()
|
||||
|
||||
for fname in flist:
|
||||
img_input = cv2.imread(fname)
|
||||
img_input = cv2.resize(img_input, (224,224)) #HARD CODED, CHANGE BEFORE RELEASE
|
||||
images.append(img_input)
|
||||
|
||||
im = numpy.stack(images,axis=3)
|
||||
im = numpy.expand_dims(im,axis=0)
|
||||
im = numpy.transpose(im,(0,3,4,1,2))
|
||||
|
||||
imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
|
||||
|
||||
# ========== ==========
|
||||
# Load audio
|
||||
# ========== ==========
|
||||
|
||||
sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))
|
||||
mfcc = zip(*python_speech_features.mfcc(audio,sample_rate))
|
||||
mfcc = numpy.stack([numpy.array(i) for i in mfcc])
|
||||
|
||||
cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0)
|
||||
cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float())
|
||||
|
||||
# ========== ==========
|
||||
# Check audio and video input length
|
||||
# ========== ==========
|
||||
|
||||
#if (float(len(audio))/16000) != (float(len(images))/25) :
|
||||
# print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25))
|
||||
|
||||
min_length = min(len(images),math.floor(len(audio)/640))
|
||||
|
||||
# ========== ==========
|
||||
# Generate video and audio feats
|
||||
# ========== ==========
|
||||
|
||||
lastframe = min_length-5
|
||||
im_feat = []
|
||||
cc_feat = []
|
||||
|
||||
tS = time.time()
|
||||
for i in range(0,lastframe,opt.batch_size):
|
||||
|
||||
im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
|
||||
im_in = torch.cat(im_batch,0)
|
||||
im_out = self.__S__.forward_lip(im_in.cuda());
|
||||
im_feat.append(im_out.data.cpu())
|
||||
|
||||
cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
|
||||
cc_in = torch.cat(cc_batch,0)
|
||||
cc_out = self.__S__.forward_aud(cc_in.cuda())
|
||||
cc_feat.append(cc_out.data.cpu())
|
||||
|
||||
im_feat = torch.cat(im_feat,0)
|
||||
cc_feat = torch.cat(cc_feat,0)
|
||||
|
||||
# ========== ==========
|
||||
# Compute offset
|
||||
# ========== ==========
|
||||
|
||||
#print('Compute time %.3f sec.' % (time.time()-tS))
|
||||
|
||||
dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift)
|
||||
mdist = torch.mean(torch.stack(dists,1),1)
|
||||
|
||||
minval, minidx = torch.min(mdist,0)
|
||||
|
||||
offset = opt.vshift-minidx
|
||||
conf = torch.median(mdist) - minval
|
||||
|
||||
fdist = numpy.stack([dist[minidx].numpy() for dist in dists])
|
||||
# fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15)
|
||||
fconf = torch.median(mdist).numpy() - fdist
|
||||
fconfm = signal.medfilt(fconf,kernel_size=9)
|
||||
|
||||
numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format})
|
||||
#print('Framewise conf: ')
|
||||
#print(fconfm)
|
||||
#print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf))
|
||||
|
||||
dists_npy = numpy.array([ dist.numpy() for dist in dists ])
|
||||
return offset.numpy(), conf.numpy(), minval.numpy()
|
||||
|
||||
def extract_feature(self, opt, videofile):
|
||||
|
||||
self.__S__.eval();
|
||||
|
||||
# ========== ==========
|
||||
# Load video
|
||||
# ========== ==========
|
||||
cap = cv2.VideoCapture(videofile)
|
||||
|
||||
frame_num = 1;
|
||||
images = []
|
||||
while frame_num:
|
||||
frame_num += 1
|
||||
ret, image = cap.read()
|
||||
if ret == 0:
|
||||
break
|
||||
|
||||
images.append(image)
|
||||
|
||||
im = numpy.stack(images,axis=3)
|
||||
im = numpy.expand_dims(im,axis=0)
|
||||
im = numpy.transpose(im,(0,3,4,1,2))
|
||||
|
||||
imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
|
||||
|
||||
# ========== ==========
|
||||
# Generate video feats
|
||||
# ========== ==========
|
||||
|
||||
lastframe = len(images)-4
|
||||
im_feat = []
|
||||
|
||||
tS = time.time()
|
||||
for i in range(0,lastframe,opt.batch_size):
|
||||
|
||||
im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
|
||||
im_in = torch.cat(im_batch,0)
|
||||
im_out = self.__S__.forward_lipfeat(im_in.cuda());
|
||||
im_feat.append(im_out.data.cpu())
|
||||
|
||||
im_feat = torch.cat(im_feat,0)
|
||||
|
||||
# ========== ==========
|
||||
# Compute offset
|
||||
# ========== ==========
|
||||
|
||||
print('Compute time %.3f sec.' % (time.time()-tS))
|
||||
|
||||
return im_feat
|
||||
|
||||
|
||||
def loadParameters(self, path):
|
||||
loaded_state = torch.load(path, map_location=lambda storage, loc: storage);
|
||||
|
||||
self_state = self.__S__.state_dict();
|
||||
|
||||
for name, param in loaded_state.items():
|
||||
|
||||
self_state[name].copy_(param);
|
||||
45
evaluation/scores/calc_real_video.py
Executable file
45
evaluation/scores/calc_real_video.py
Executable file
|
|
@ -0,0 +1,45 @@
|
|||
#!/usr/bin/python
|
||||
#-*- coding: utf-8 -*-
|
||||
|
||||
import time, pdb, argparse, subprocess, pickle, os, gzip, glob
|
||||
|
||||
from SyncNetInstance_calc_scores import *
|
||||
|
||||
# ==================== PARSE ARGUMENT ====================
|
||||
|
||||
parser = argparse.ArgumentParser(description = "SyncNet");
|
||||
parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
|
||||
parser.add_argument('--batch_size', type=int, default='20', help='');
|
||||
parser.add_argument('--vshift', type=int, default='15', help='');
|
||||
parser.add_argument('--data_dir', type=str, default='data/work', help='');
|
||||
parser.add_argument('--videofile', type=str, default='', help='');
|
||||
parser.add_argument('--reference', type=str, default='', help='');
|
||||
opt = parser.parse_args();
|
||||
|
||||
setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi'))
|
||||
setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp'))
|
||||
setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork'))
|
||||
setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop'))
|
||||
|
||||
|
||||
# ==================== LOAD MODEL AND FILE LIST ====================
|
||||
|
||||
s = SyncNetInstance();
|
||||
|
||||
s.loadParameters(opt.initial_model);
|
||||
#print("Model %s loaded."%opt.initial_model);
|
||||
|
||||
flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi'))
|
||||
flist.sort()
|
||||
|
||||
# ==================== GET OFFSETS ====================
|
||||
|
||||
dists = []
|
||||
for idx, fname in enumerate(flist):
|
||||
offset, conf, dist = s.evaluate(opt,videofile=fname)
|
||||
print (str(dist)+" "+str(conf))
|
||||
|
||||
# ==================== PRINT RESULTS TO FILE ====================
|
||||
|
||||
#with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil:
|
||||
# pickle.dump(dists, fil)
|
||||
53
evaluation/scores/calc_scores_syncnet.py
Executable file
53
evaluation/scores/calc_scores_syncnet.py
Executable file
|
|
@ -0,0 +1,53 @@
|
|||
#!/usr/bin/python
|
||||
#-*- coding: utf-8 -*-
|
||||
|
||||
import time, pdb, argparse, subprocess
|
||||
import glob
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
|
||||
from SyncNetInstance_calc_scores import *
|
||||
|
||||
# ==================== LOAD PARAMS ====================
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(description = "SyncNet");
|
||||
|
||||
parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
|
||||
parser.add_argument('--batch_size', type=int, default='20', help='');
|
||||
parser.add_argument('--vshift', type=int, default='15', help='');
|
||||
parser.add_argument('--data_root', type=str, required=True, help='');
|
||||
parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help='');
|
||||
parser.add_argument('--reference', type=str, default="demo", help='');
|
||||
|
||||
opt = parser.parse_args();
|
||||
|
||||
|
||||
# ==================== RUN EVALUATION ====================
|
||||
|
||||
s = SyncNetInstance();
|
||||
|
||||
s.loadParameters(opt.initial_model);
|
||||
#print("Model %s loaded."%opt.initial_model);
|
||||
path = os.path.join(opt.data_root, "*.mp4")
|
||||
|
||||
all_videos = glob.glob(path)
|
||||
|
||||
prog_bar = tqdm(range(len(all_videos)))
|
||||
avg_confidence = 0.
|
||||
avg_min_distance = 0.
|
||||
|
||||
|
||||
for videofile_idx in prog_bar:
|
||||
videofile = all_videos[videofile_idx]
|
||||
offset, confidence, min_distance = s.evaluate(opt, videofile=videofile)
|
||||
avg_confidence += confidence
|
||||
avg_min_distance += min_distance
|
||||
prog_bar.set_description('Avg Confidence: {}, Avg Minimum Dist: {}'.format(round(avg_confidence / (videofile_idx + 1), 3), round(avg_min_distance / (videofile_idx + 1), 3)))
|
||||
prog_bar.refresh()
|
||||
|
||||
print ('Average Confidence: {}'.format(avg_confidence/len(all_videos)))
|
||||
print ('Average Minimum Distance: {}'.format(avg_min_distance/len(all_videos)))
|
||||
|
||||
|
||||
|
||||
6
evaluation/scores/scores_calc_real_videos.sh
Normal file
6
evaluation/scores/scores_calc_real_videos.sh
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
yourfilenames=`ls $1`
|
||||
for eachfile in $yourfilenames
|
||||
do
|
||||
python run_pipeline.py --videofile $eachfile --reference lipsyncnet --data_dir /ssd_scratch/cvit/prajwalkr_rudra/tmp_dir
|
||||
python calc_real_video.py --videofile $eachfile --reference lipsyncnet --data_dir /ssd_scratch/cvit/prajwalkr_rudra/tmp_dir >> test_score.txt
|
||||
done
|
||||
Loading…
Reference in a new issue