diff --git a/evaluation/scores/SyncNetInstance_calc_scores.py b/evaluation/scores/SyncNetInstance_calc_scores.py new file mode 100644 index 0000000..64906e2 --- /dev/null +++ b/evaluation/scores/SyncNetInstance_calc_scores.py @@ -0,0 +1,210 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +# Video 25 FPS, Audio 16000HZ + +import torch +import numpy +import time, pdb, argparse, subprocess, os, math, glob +import cv2 +import python_speech_features + +from scipy import signal +from scipy.io import wavfile +from SyncNetModel import * +from shutil import rmtree + + +# ==================== Get OFFSET ==================== + +def calc_pdist(feat1, feat2, vshift=10): + + win_size = vshift*2+1 + + feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift)) + + dists = [] + + for i in range(0,len(feat1)): + + dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:])) + + return dists + +# ==================== MAIN DEF ==================== + +class SyncNetInstance(torch.nn.Module): + + def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024): + super(SyncNetInstance, self).__init__(); + + self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda(); + + def evaluate(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Convert files + # ========== ========== + + if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): + rmtree(os.path.join(opt.tmp_dir,opt.reference)) + + os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) + + command = ("ffmpeg -loglevel error -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) + output = subprocess.call(command, shell=True, stdout=None) + + command = ("ffmpeg -loglevel error -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) + output = subprocess.call(command, shell=True, stdout=None) + + # ========== ========== + # Load video + # ========== ========== + + images = [] + + flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg')) + flist.sort() + + for fname in flist: + img_input = cv2.imread(fname) + img_input = cv2.resize(img_input, (224,224)) #HARD CODED, CHANGE BEFORE RELEASE + images.append(img_input) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Load audio + # ========== ========== + + sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav')) + mfcc = zip(*python_speech_features.mfcc(audio,sample_rate)) + mfcc = numpy.stack([numpy.array(i) for i in mfcc]) + + cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0) + cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float()) + + # ========== ========== + # Check audio and video input length + # ========== ========== + + #if (float(len(audio))/16000) != (float(len(images))/25) : + # print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25)) + + min_length = min(len(images),math.floor(len(audio)/640)) + + # ========== ========== + # Generate video and audio feats + # ========== ========== + + lastframe = min_length-5 + im_feat = [] + cc_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lip(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + cc_in = torch.cat(cc_batch,0) + cc_out = self.__S__.forward_aud(cc_in.cuda()) + cc_feat.append(cc_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + cc_feat = torch.cat(cc_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + #print('Compute time %.3f sec.' % (time.time()-tS)) + + dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift) + mdist = torch.mean(torch.stack(dists,1),1) + + minval, minidx = torch.min(mdist,0) + + offset = opt.vshift-minidx + conf = torch.median(mdist) - minval + + fdist = numpy.stack([dist[minidx].numpy() for dist in dists]) + # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) + fconf = torch.median(mdist).numpy() - fdist + fconfm = signal.medfilt(fconf,kernel_size=9) + + numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) + #print('Framewise conf: ') + #print(fconfm) + #print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf)) + + dists_npy = numpy.array([ dist.numpy() for dist in dists ]) + return offset.numpy(), conf.numpy(), minval.numpy() + + def extract_feature(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Load video + # ========== ========== + cap = cv2.VideoCapture(videofile) + + frame_num = 1; + images = [] + while frame_num: + frame_num += 1 + ret, image = cap.read() + if ret == 0: + break + + images.append(image) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Generate video feats + # ========== ========== + + lastframe = len(images)-4 + im_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lipfeat(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + print('Compute time %.3f sec.' % (time.time()-tS)) + + return im_feat + + + def loadParameters(self, path): + loaded_state = torch.load(path, map_location=lambda storage, loc: storage); + + self_state = self.__S__.state_dict(); + + for name, param in loaded_state.items(): + + self_state[name].copy_(param); diff --git a/evaluation/scores/calc_real_video.py b/evaluation/scores/calc_real_video.py new file mode 100755 index 0000000..0962258 --- /dev/null +++ b/evaluation/scores/calc_real_video.py @@ -0,0 +1,45 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess, pickle, os, gzip, glob + +from SyncNetInstance_calc_scores import * + +# ==================== PARSE ARGUMENT ==================== + +parser = argparse.ArgumentParser(description = "SyncNet"); +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--data_dir', type=str, default='data/work', help=''); +parser.add_argument('--videofile', type=str, default='', help=''); +parser.add_argument('--reference', type=str, default='', help=''); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) + + +# ==================== LOAD MODEL AND FILE LIST ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +#print("Model %s loaded."%opt.initial_model); + +flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi')) +flist.sort() + +# ==================== GET OFFSETS ==================== + +dists = [] +for idx, fname in enumerate(flist): + offset, conf, dist = s.evaluate(opt,videofile=fname) + print (str(dist)+" "+str(conf)) + +# ==================== PRINT RESULTS TO FILE ==================== + +#with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil: +# pickle.dump(dists, fil) diff --git a/evaluation/scores/calc_scores_syncnet.py b/evaluation/scores/calc_scores_syncnet.py new file mode 100755 index 0000000..eda02b8 --- /dev/null +++ b/evaluation/scores/calc_scores_syncnet.py @@ -0,0 +1,53 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess +import glob +import os +from tqdm import tqdm + +from SyncNetInstance_calc_scores import * + +# ==================== LOAD PARAMS ==================== + + +parser = argparse.ArgumentParser(description = "SyncNet"); + +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--data_root', type=str, required=True, help=''); +parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help=''); +parser.add_argument('--reference', type=str, default="demo", help=''); + +opt = parser.parse_args(); + + +# ==================== RUN EVALUATION ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +#print("Model %s loaded."%opt.initial_model); +path = os.path.join(opt.data_root, "*.mp4") + +all_videos = glob.glob(path) + +prog_bar = tqdm(range(len(all_videos))) +avg_confidence = 0. +avg_min_distance = 0. + + +for videofile_idx in prog_bar: + videofile = all_videos[videofile_idx] + offset, confidence, min_distance = s.evaluate(opt, videofile=videofile) + avg_confidence += confidence + avg_min_distance += min_distance + prog_bar.set_description('Avg Confidence: {}, Avg Minimum Dist: {}'.format(round(avg_confidence / (videofile_idx + 1), 3), round(avg_min_distance / (videofile_idx + 1), 3))) + prog_bar.refresh() + +print ('Average Confidence: {}'.format(avg_confidence/len(all_videos))) +print ('Average Minimum Distance: {}'.format(avg_min_distance/len(all_videos))) + + + diff --git a/evaluation/scores/scores_calc_real_videos.sh b/evaluation/scores/scores_calc_real_videos.sh new file mode 100644 index 0000000..8634f95 --- /dev/null +++ b/evaluation/scores/scores_calc_real_videos.sh @@ -0,0 +1,6 @@ +yourfilenames=`ls $1` +for eachfile in $yourfilenames +do + python run_pipeline.py --videofile $eachfile --reference lipsyncnet --data_dir /ssd_scratch/cvit/prajwalkr_rudra/tmp_dir + python calc_real_video.py --videofile $eachfile --reference lipsyncnet --data_dir /ssd_scratch/cvit/prajwalkr_rudra/tmp_dir >> test_score.txt +done