|
|
#!/usr/bin/env Rscript
require(euroformix);# sessionInfo() # setwd("C:\\Users\\oyvbl\\Dropbox\\Forensic\\euroformix0\\runMultipleSamples") #IMPORTANT TO SET YOUR WORKDIRECTORY GIVEN AS SAME FOLDER AS YOUR FILES/EVIDENCE-FOLDERS # cjs: stephen #rm(list=ls()) #source("lrmix_multisampleUsage.R") library(forensim)
################ #help functions# ################ readFreq <- function(file) { #import popfrequencies: table <- read.table(file,header=TRUE,sep=",") locs <- toupper(colnames(table[-1])) popFreq <- list() for(i in 1:length(locs)) { freqs <- table[,i+1] popFreq[[i]] <- table[!is.na(freqs),i+1] names(popFreq[[i]]) <- table[!is.na(freqs),1] } names(popFreq) <- locs return(popFreq) }
tableReader=function(filename) { tab <- read.table(filename,header=TRUE,sep="\t",stringsAsFactors=FALSE) tryCatch( { if(ncol(tab)==1) tab <- read.table(filename,header=TRUE,sep=",",stringsAsFactors=FALSE) } ,error=function(e) e) tryCatch( { if(ncol(tab)==1) tab <- read.table(filename,header=TRUE,sep=";",stringsAsFactors=FALSE) } ,error=function(e) e) if(ncol(tab)==1) tab <- read.table(filename,header=TRUE,sep=";",stringsAsFactors=FALSE) return(tab) #need dataframe to keep allele-names correct!! }
sample_tableToList = function(X,threshT=NULL) { cn = colnames(X) #colnames lind = grep("marker",tolower(cn),fixed=TRUE) #locus col-ind if(length(lind)==0) lind = grep("loc",tolower(cn),fixed=TRUE) #try another name sind = grep("sample",tolower(cn),fixed=TRUE) #sample col-ind if(length(sind)>1) sind = sind[grep("name",tolower(cn[sind]),fixed=TRUE)] #use only sample name A_ind = grep("allele",tolower(cn),fixed=TRUE) #allele col-ind H_ind = grep("height",tolower(cn),fixed=TRUE) #height col-ind ln = unique(toupper(X[,lind])) #locus names: Convert to upper case sn = unique(as.character(X[,sind])) #sample names I = length(ln) Y = list() #insert non-empty characters: for(k in 1:length(sn)) { #for each sample in matrix Y[[sn[k]]] = list() #one list for each sample for(i in 1:I) { #for each locus xind = X[,sind]==sn[k] & toupper(X[,lind])==ln[i] #get index in X for given sample and locus if(sum(xind)==0) next keep <- which(!is.na(X[xind,A_ind]) & X[xind,A_ind]!="") if(length(H_ind)>0) { #If peak heights are considered PH <- as.numeric(as.character(X[xind,H_ind][keep])) #get the peak heights if(!is.null(threshT)) keep = which(PH>=threshT) #keep only alleles above thrshold (if given) Y[[sn[k]]][[ln[i]]]$hdata = PH[keep] } if(length(A_ind)>0) { Y[[sn[k]]][[ln[i]]]$adata = as.character(X[xind,A_ind][keep]) } } } names(Y) <- sn return(Y) }
getData <- function(mixData2,refData2,popFreq) { #Helpfunction to get data to analyse locs <- names(popFreq) mixData <- lapply(mixData2,function(x) return(x[locs])) #return selected loci refData <- list() for(loc in locs) refData[[loc]] <- lapply(refData2,function(x) return(x[[loc]]$adata)) #return selected loci Qret <- Qassignate(samples=mixData, popFreq, refData,incS=FALSE,incR=FALSE) #NB: NOTICE THE CHANGE HERE OF inclS=FALSE even for stutter model (this has been updated in v2( return(list(samples=mixData,refData=Qret$refData,popFreq=Qret$popFreq)) }
calcLR <- function(pD) { LR<-1 pDvec = rep(pD,nC) for(loc in names(dat$popFreq)) { #for each locus Ei <- NULL #get evidence for(ss in 1:length(dat$samples)) { #fix samples if(ss>1) Ei <- c(Ei,0) #seperate with 0 adata <-dat$samples[[ss]][[loc]]$adata if(length(adata)==0) adata=0 #is empty Ei <- c(Ei,adata) } rdata <- dat$refData[[loc]] #reference data hpval <- likEvid( Ei,T=unlist(rdata),V=NULL,x=nC-1,theta=fst, prDHet=pDvec, prDHom=pDvec^2, prC=pC, freq=dat$popFreq[[loc]]) hdval <- likEvid( Ei,T=NULL,V=unlist(rdata),x=nC,theta=fst, prDHet=pDvec, prDHom=pDvec^2, prC=pC, freq=dat$popFreq[[loc]]) LR <- LR*hpval/hdval } #end for each markers return(LR) }
###################################################################
#SCRIPT STARTS HERE: library("rjson") # cjs: stephen args = commandArgs(trailingOnly=TRUE) # cjs: stephen testFile = fromJSON(file=args[1]) # cjs: stephen
settings = fromJSON(file=paste0(testFile$resource_dir, "/settings.json")) # cjs: stephen
workingDir = testFile$working_dir # cjs: stephen setwd(workingDir) # cjs: stephen
#get popfreq file: databaseFile = paste0(testFile$resource_dir, "/", settings$databaseFile) # opt$database # cjs: stephen #The allele frequency file popFreq <- readFreq(databaseFile) #import population freqs #names(popFreq) #loci to consider
#Get evidences (files) # evidfold <- "evids" #opt$samples #The folder-name with files including evidence profiles # cjs: stephen # files = list.files(evidfold) # cjs: stephen
#get references: refFile <- testFile$comparison_file # opt$ref #the file including references # cjs: stephen refData=sample_tableToList(tableReader(refFile)) #load references rN <- names(refData) #names of references
#Model setup: kit = settings$kit threshT = settings$threshT # cjs: stephne # 200 #opt$threshold #25 #detection threshold (rfu) fst = settings$fst #cjs: stephen # 0.01 nC = testFile$num_contributors #cjs: stephen # opt$unknowns #assumed number of contributors
dropin= settings$dropin # cjs: stephen # TRUE #opt$doDropin #consider drop-in model? pC=0 if(dropin) { if (testFile$num_replicates == 2) { pC = 0.02 } else if (testFile$num_replicates == 3) { pC = 0.035 } else { stop('Bad number of replicates') } # cjs: stephen }
#Outfile to store results setup <- paste0("T",threshT,"_fst",fst,"_pC",pC,"_C",nC) outf <- paste0("output/", testFile$batch_dir, "/qualitative_", testFile$test_name,".csv") # cjs: stephen
cn=c("EvidFile","POI","log10LR") out = matrix(nrow=0,ncol=length(cn)) colnames(out) = cn
# Loop over cases begin=Sys.time() #start timer evidfile = testFile$evidence_file # cjs: stephen # paste0(evidfold,"/",files[i]) #evidence files assumed to be looking in the evidfolder mixData = sample_tableToList( X=tableReader( evidfile),threshT=threshT ) #get sample to analyse. NOTICE THAT THE PEAK HEIGHT THRESHOLD IS GIVEN AS ARGUMENT
for(j in 1:length(rN)) { refData2 <- refData[j] #consider only ref "j" as POI hpcond <- c(1) #Hp condition: ref i is contributor 1. This example only consider 1 reference profile. With x reference profiles this must be a x long vector. hdcond <- c(0) #Hd condition: ref i is not-contributor .This example only consider 1 reference profile. With x reference profiles this must be a x long vector. knownRefHd <- 1 #condition under Hd that ref i is a known non-contributors. This is a vector specifying which of the i-th references that are known non-contributors under hd.
#plotEPG(Data=mixData,kitname=kit,threshT=threshT,refcond=refData2,showPH=TRUE) #plotting evidence with ref dat <- getData(mixData,refData2,popFreq) #process data for euroformix calculations (NOTICE THE CHANGE HERE OF NOT INCLUDING STUTTERS) nS = length(dat$samples) #number of samples
#Perform calculatations set.seed(1) totAv <- sapply(dat$samples, function(x) sum(sapply(x,function(y) length(y$adata)))) #get number of alleles refData3 <- list() for(loc in names(dat$popFreq)) refData3[[loc]] <- lapply(refData2,function(x) x[[loc]]$adata) #get format for simDOdistr
dropqq <- c(0.05,0.95) #quantiles to estimate diMd <- diMp <- rep(0,length(dropqq))# niter = 1e4 #required number of samples for(s in 1: nS) { #fix samples dihd <- simDOdistr(totA=totAv[s],nC=nC,popFreq,refData=NULL,minS=niter, prC=pC,M=2000) #consider only model under Hd dihp <- simDOdistr(totA=totAv[s],nC=nC,popFreq,refData=refData3,minS=niter, prC=pC,M=2000) #consider only model under Hd diMd <- quantile(dihd,dropqq)/nS diMp <- quantile(dihp,dropqq)/nS } div <- c(diMp,diMd) LRmc <- Vectorize(calcLR)(div) LR <- LRmc[which.min(LRmc)] #get conservative LR in LRmix
# update out object out = rbind(out, c(testFile$evidence_name,rN[j],LR))
# Export overall results write.table(out,file=outf,row.names=FALSE) } end=Sys.time() #end timer runtime=difftime(end,begin) #Calculate the total running time: paste("Time taken: ", sprintf("%.2fmin", runtime))
|