You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

191 lines
8.2 KiB

#!/usr/bin/env Rscript
require(euroformix);# sessionInfo()
# setwd("C:\\Users\\oyvbl\\Dropbox\\Forensic\\euroformix0\\runMultipleSamples") #IMPORTANT TO SET YOUR WORKDIRECTORY GIVEN AS SAME FOLDER AS YOUR FILES/EVIDENCE-FOLDERS # cjs: stephen
#rm(list=ls())
#source("lrmix_multisampleUsage.R")
library(forensim)
################
#help functions#
################
readFreq <- function(file) { #import popfrequencies:
table <- read.table(file,header=TRUE,sep=",")
locs <- toupper(colnames(table[-1]))
popFreq <- list()
for(i in 1:length(locs)) {
freqs <- table[,i+1]
popFreq[[i]] <- table[!is.na(freqs),i+1]
names(popFreq[[i]]) <- table[!is.na(freqs),1]
}
names(popFreq) <- locs
return(popFreq)
}
tableReader=function(filename) {
tab <- read.table(filename,header=TRUE,sep="\t",stringsAsFactors=FALSE)
tryCatch( { if(ncol(tab)==1) tab <- read.table(filename,header=TRUE,sep=",",stringsAsFactors=FALSE) } ,error=function(e) e)
tryCatch( { if(ncol(tab)==1) tab <- read.table(filename,header=TRUE,sep=";",stringsAsFactors=FALSE) } ,error=function(e) e)
if(ncol(tab)==1) tab <- read.table(filename,header=TRUE,sep=";",stringsAsFactors=FALSE)
return(tab) #need dataframe to keep allele-names correct!!
}
sample_tableToList = function(X,threshT=NULL) {
cn = colnames(X) #colnames
lind = grep("marker",tolower(cn),fixed=TRUE) #locus col-ind
if(length(lind)==0) lind = grep("loc",tolower(cn),fixed=TRUE) #try another name
sind = grep("sample",tolower(cn),fixed=TRUE) #sample col-ind
if(length(sind)>1) sind = sind[grep("name",tolower(cn[sind]),fixed=TRUE)] #use only sample name
A_ind = grep("allele",tolower(cn),fixed=TRUE) #allele col-ind
H_ind = grep("height",tolower(cn),fixed=TRUE) #height col-ind
ln = unique(toupper(X[,lind])) #locus names: Convert to upper case
sn = unique(as.character(X[,sind])) #sample names
I = length(ln)
Y = list() #insert non-empty characters:
for(k in 1:length(sn)) { #for each sample in matrix
Y[[sn[k]]] = list() #one list for each sample
for(i in 1:I) { #for each locus
xind = X[,sind]==sn[k] & toupper(X[,lind])==ln[i] #get index in X for given sample and locus
if(sum(xind)==0) next
keep <- which(!is.na(X[xind,A_ind]) & X[xind,A_ind]!="")
if(length(H_ind)>0) { #If peak heights are considered
PH <- as.numeric(as.character(X[xind,H_ind][keep])) #get the peak heights
if(!is.null(threshT)) keep = which(PH>=threshT) #keep only alleles above thrshold (if given)
Y[[sn[k]]][[ln[i]]]$hdata = PH[keep]
}
if(length(A_ind)>0) {
Y[[sn[k]]][[ln[i]]]$adata = as.character(X[xind,A_ind][keep])
}
}
}
names(Y) <- sn
return(Y)
}
getData <- function(mixData2,refData2,popFreq) { #Helpfunction to get data to analyse
locs <- names(popFreq)
mixData <- lapply(mixData2,function(x) return(x[locs])) #return selected loci
refData <- list()
for(loc in locs) refData[[loc]] <- lapply(refData2,function(x) return(x[[loc]]$adata)) #return selected loci
Qret <- Qassignate(samples=mixData, popFreq, refData,incS=FALSE,incR=FALSE) #NB: NOTICE THE CHANGE HERE OF inclS=FALSE even for stutter model (this has been updated in v2(
return(list(samples=mixData,refData=Qret$refData,popFreq=Qret$popFreq))
}
calcLR <- function(pD) {
LR<-1
pDvec = rep(pD,nC)
for(loc in names(dat$popFreq)) { #for each locus
Ei <- NULL #get evidence
for(ss in 1:length(dat$samples)) { #fix samples
if(ss>1) Ei <- c(Ei,0) #seperate with 0
adata <-dat$samples[[ss]][[loc]]$adata
if(length(adata)==0) adata=0 #is empty
Ei <- c(Ei,adata)
}
rdata <- dat$refData[[loc]] #reference data
hpval <- likEvid( Ei,T=unlist(rdata),V=NULL,x=nC-1,theta=fst, prDHet=pDvec, prDHom=pDvec^2, prC=pC, freq=dat$popFreq[[loc]])
hdval <- likEvid( Ei,T=NULL,V=unlist(rdata),x=nC,theta=fst, prDHet=pDvec, prDHom=pDvec^2, prC=pC, freq=dat$popFreq[[loc]])
LR <- LR*hpval/hdval
} #end for each markers
return(LR)
}
###################################################################
#SCRIPT STARTS HERE:
library("rjson") # cjs: stephen
args = commandArgs(trailingOnly=TRUE) # cjs: stephen
testFile = fromJSON(file=args[1]) # cjs: stephen
settings = fromJSON(file=paste0(testFile$resource_dir, "/settings.json")) # cjs: stephen
workingDir = testFile$working_dir # cjs: stephen
setwd(workingDir) # cjs: stephen
#get popfreq file:
databaseFile = paste0(testFile$resource_dir, "/", settings$databaseFile) # opt$database # cjs: stephen
#The allele frequency file
popFreq <- readFreq(databaseFile) #import population freqs
#names(popFreq) #loci to consider
#Get evidences (files)
# evidfold <- "evids" #opt$samples #The folder-name with files including evidence profiles # cjs: stephen
# files = list.files(evidfold) # cjs: stephen
#get references:
refFile <- testFile$comparison_file # opt$ref #the file including references # cjs: stephen
refData=sample_tableToList(tableReader(refFile)) #load references
rN <- names(refData) #names of references
#Model setup:
kit = settings$kit
threshT = settings$threshT # cjs: stephne # 200 #opt$threshold #25 #detection threshold (rfu)
fst = settings$fst #cjs: stephen # 0.01
nC = testFile$num_contributors #cjs: stephen # opt$unknowns #assumed number of contributors
dropin= settings$dropin # cjs: stephen # TRUE #opt$doDropin #consider drop-in model?
pC=0
if(dropin) {
if (testFile$num_replicates == 2) {
pC = 0.02
}
else if (testFile$num_replicates == 3) {
pC = 0.035
}
else {
stop('Bad number of replicates')
} # cjs: stephen
}
#Outfile to store results
setup <- paste0("T",threshT,"_fst",fst,"_pC",pC,"_C",nC)
outf <- paste0("output/", testFile$batch_dir, "/qualitative_", testFile$test_name,".csv") # cjs: stephen
cn=c("EvidFile","POI","log10LR")
out = matrix(nrow=0,ncol=length(cn))
colnames(out) = cn
# Loop over cases
begin=Sys.time() #start timer
evidfile = testFile$evidence_file # cjs: stephen # paste0(evidfold,"/",files[i]) #evidence files assumed to be looking in the evidfolder
mixData = sample_tableToList( X=tableReader( evidfile),threshT=threshT ) #get sample to analyse. NOTICE THAT THE PEAK HEIGHT THRESHOLD IS GIVEN AS ARGUMENT
for(j in 1:length(rN)) {
refData2 <- refData[j] #consider only ref "j" as POI
hpcond <- c(1) #Hp condition: ref i is contributor 1. This example only consider 1 reference profile. With x reference profiles this must be a x long vector.
hdcond <- c(0) #Hd condition: ref i is not-contributor .This example only consider 1 reference profile. With x reference profiles this must be a x long vector.
knownRefHd <- 1 #condition under Hd that ref i is a known non-contributors. This is a vector specifying which of the i-th references that are known non-contributors under hd.
#plotEPG(Data=mixData,kitname=kit,threshT=threshT,refcond=refData2,showPH=TRUE) #plotting evidence with ref
dat <- getData(mixData,refData2,popFreq) #process data for euroformix calculations (NOTICE THE CHANGE HERE OF NOT INCLUDING STUTTERS)
nS = length(dat$samples) #number of samples
#Perform calculatations
set.seed(1)
totAv <- sapply(dat$samples, function(x) sum(sapply(x,function(y) length(y$adata)))) #get number of alleles
refData3 <- list()
for(loc in names(dat$popFreq)) refData3[[loc]] <- lapply(refData2,function(x) x[[loc]]$adata) #get format for simDOdistr
dropqq <- c(0.05,0.95) #quantiles to estimate
diMd <- diMp <- rep(0,length(dropqq))#
niter = 1e4 #required number of samples
for(s in 1: nS) { #fix samples
dihd <- simDOdistr(totA=totAv[s],nC=nC,popFreq,refData=NULL,minS=niter, prC=pC,M=2000) #consider only model under Hd
dihp <- simDOdistr(totA=totAv[s],nC=nC,popFreq,refData=refData3,minS=niter, prC=pC,M=2000) #consider only model under Hd
diMd <- quantile(dihd,dropqq)/nS
diMp <- quantile(dihp,dropqq)/nS
}
div <- c(diMp,diMd)
LRmc <- Vectorize(calcLR)(div)
LR <- LRmc[which.min(LRmc)] #get conservative LR in LRmix
# update out object
out = rbind(out, c(testFile$evidence_name,rN[j],LR))
# Export overall results
write.table(out,file=outf,row.names=FALSE)
}
end=Sys.time() #end timer
runtime=difftime(end,begin) #Calculate the total running time:
paste("Time taken: ", sprintf("%.2fmin", runtime))