You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

191 lines
8.2 KiB

  1. #!/usr/bin/env Rscript
  2. require(euroformix);# sessionInfo()
  3. # setwd("C:\\Users\\oyvbl\\Dropbox\\Forensic\\euroformix0\\runMultipleSamples") #IMPORTANT TO SET YOUR WORKDIRECTORY GIVEN AS SAME FOLDER AS YOUR FILES/EVIDENCE-FOLDERS # cjs: stephen
  4. #rm(list=ls())
  5. #source("lrmix_multisampleUsage.R")
  6. library(forensim)
  7. ################
  8. #help functions#
  9. ################
  10. readFreq <- function(file) { #import popfrequencies:
  11. table <- read.table(file,header=TRUE,sep=",")
  12. locs <- toupper(colnames(table[-1]))
  13. popFreq <- list()
  14. for(i in 1:length(locs)) {
  15. freqs <- table[,i+1]
  16. popFreq[[i]] <- table[!is.na(freqs),i+1]
  17. names(popFreq[[i]]) <- table[!is.na(freqs),1]
  18. }
  19. names(popFreq) <- locs
  20. return(popFreq)
  21. }
  22. tableReader=function(filename) {
  23. tab <- read.table(filename,header=TRUE,sep="\t",stringsAsFactors=FALSE)
  24. tryCatch( { if(ncol(tab)==1) tab <- read.table(filename,header=TRUE,sep=",",stringsAsFactors=FALSE) } ,error=function(e) e)
  25. tryCatch( { if(ncol(tab)==1) tab <- read.table(filename,header=TRUE,sep=";",stringsAsFactors=FALSE) } ,error=function(e) e)
  26. if(ncol(tab)==1) tab <- read.table(filename,header=TRUE,sep=";",stringsAsFactors=FALSE)
  27. return(tab) #need dataframe to keep allele-names correct!!
  28. }
  29. sample_tableToList = function(X,threshT=NULL) {
  30. cn = colnames(X) #colnames
  31. lind = grep("marker",tolower(cn),fixed=TRUE) #locus col-ind
  32. if(length(lind)==0) lind = grep("loc",tolower(cn),fixed=TRUE) #try another name
  33. sind = grep("sample",tolower(cn),fixed=TRUE) #sample col-ind
  34. if(length(sind)>1) sind = sind[grep("name",tolower(cn[sind]),fixed=TRUE)] #use only sample name
  35. A_ind = grep("allele",tolower(cn),fixed=TRUE) #allele col-ind
  36. H_ind = grep("height",tolower(cn),fixed=TRUE) #height col-ind
  37. ln = unique(toupper(X[,lind])) #locus names: Convert to upper case
  38. sn = unique(as.character(X[,sind])) #sample names
  39. I = length(ln)
  40. Y = list() #insert non-empty characters:
  41. for(k in 1:length(sn)) { #for each sample in matrix
  42. Y[[sn[k]]] = list() #one list for each sample
  43. for(i in 1:I) { #for each locus
  44. xind = X[,sind]==sn[k] & toupper(X[,lind])==ln[i] #get index in X for given sample and locus
  45. if(sum(xind)==0) next
  46. keep <- which(!is.na(X[xind,A_ind]) & X[xind,A_ind]!="")
  47. if(length(H_ind)>0) { #If peak heights are considered
  48. PH <- as.numeric(as.character(X[xind,H_ind][keep])) #get the peak heights
  49. if(!is.null(threshT)) keep = which(PH>=threshT) #keep only alleles above thrshold (if given)
  50. Y[[sn[k]]][[ln[i]]]$hdata = PH[keep]
  51. }
  52. if(length(A_ind)>0) {
  53. Y[[sn[k]]][[ln[i]]]$adata = as.character(X[xind,A_ind][keep])
  54. }
  55. }
  56. }
  57. names(Y) <- sn
  58. return(Y)
  59. }
  60. getData <- function(mixData2,refData2,popFreq) { #Helpfunction to get data to analyse
  61. locs <- names(popFreq)
  62. mixData <- lapply(mixData2,function(x) return(x[locs])) #return selected loci
  63. refData <- list()
  64. for(loc in locs) refData[[loc]] <- lapply(refData2,function(x) return(x[[loc]]$adata)) #return selected loci
  65. Qret <- Qassignate(samples=mixData, popFreq, refData,incS=FALSE,incR=FALSE) #NB: NOTICE THE CHANGE HERE OF inclS=FALSE even for stutter model (this has been updated in v2(
  66. return(list(samples=mixData,refData=Qret$refData,popFreq=Qret$popFreq))
  67. }
  68. calcLR <- function(pD) {
  69. LR<-1
  70. pDvec = rep(pD,nC)
  71. for(loc in names(dat$popFreq)) { #for each locus
  72. Ei <- NULL #get evidence
  73. for(ss in 1:length(dat$samples)) { #fix samples
  74. if(ss>1) Ei <- c(Ei,0) #seperate with 0
  75. adata <-dat$samples[[ss]][[loc]]$adata
  76. if(length(adata)==0) adata=0 #is empty
  77. Ei <- c(Ei,adata)
  78. }
  79. rdata <- dat$refData[[loc]] #reference data
  80. hpval <- likEvid( Ei,T=unlist(rdata),V=NULL,x=nC-1,theta=fst, prDHet=pDvec, prDHom=pDvec^2, prC=pC, freq=dat$popFreq[[loc]])
  81. hdval <- likEvid( Ei,T=NULL,V=unlist(rdata),x=nC,theta=fst, prDHet=pDvec, prDHom=pDvec^2, prC=pC, freq=dat$popFreq[[loc]])
  82. LR <- LR*hpval/hdval
  83. } #end for each markers
  84. return(LR)
  85. }
  86. ###################################################################
  87. #SCRIPT STARTS HERE:
  88. library("rjson") # cjs: stephen
  89. args = commandArgs(trailingOnly=TRUE) # cjs: stephen
  90. testFile = fromJSON(file=args[1]) # cjs: stephen
  91. settings = fromJSON(file=paste0(testFile$resource_dir, "/settings.json")) # cjs: stephen
  92. workingDir = testFile$working_dir # cjs: stephen
  93. setwd(workingDir) # cjs: stephen
  94. #get popfreq file:
  95. databaseFile = paste0(testFile$resource_dir, "/", settings$databaseFile) # opt$database # cjs: stephen
  96. #The allele frequency file
  97. popFreq <- readFreq(databaseFile) #import population freqs
  98. #names(popFreq) #loci to consider
  99. #Get evidences (files)
  100. # evidfold <- "evids" #opt$samples #The folder-name with files including evidence profiles # cjs: stephen
  101. # files = list.files(evidfold) # cjs: stephen
  102. #get references:
  103. refFile <- testFile$comparison_file # opt$ref #the file including references # cjs: stephen
  104. refData=sample_tableToList(tableReader(refFile)) #load references
  105. rN <- names(refData) #names of references
  106. #Model setup:
  107. kit = settings$kit
  108. threshT = settings$threshT # cjs: stephne # 200 #opt$threshold #25 #detection threshold (rfu)
  109. fst = settings$fst #cjs: stephen # 0.01
  110. nC = testFile$num_contributors #cjs: stephen # opt$unknowns #assumed number of contributors
  111. dropin= settings$dropin # cjs: stephen # TRUE #opt$doDropin #consider drop-in model?
  112. pC=0
  113. if(dropin) {
  114. if (testFile$num_replicates == 2) {
  115. pC = 0.02
  116. }
  117. else if (testFile$num_replicates == 3) {
  118. pC = 0.035
  119. }
  120. else {
  121. stop('Bad number of replicates')
  122. } # cjs: stephen
  123. }
  124. #Outfile to store results
  125. setup <- paste0("T",threshT,"_fst",fst,"_pC",pC,"_C",nC)
  126. outf <- paste0("output/", testFile$batch_dir, "/qualitative_", testFile$test_name,".csv") # cjs: stephen
  127. cn=c("EvidFile","POI","log10LR")
  128. out = matrix(nrow=0,ncol=length(cn))
  129. colnames(out) = cn
  130. # Loop over cases
  131. begin=Sys.time() #start timer
  132. evidfile = testFile$evidence_file # cjs: stephen # paste0(evidfold,"/",files[i]) #evidence files assumed to be looking in the evidfolder
  133. mixData = sample_tableToList( X=tableReader( evidfile),threshT=threshT ) #get sample to analyse. NOTICE THAT THE PEAK HEIGHT THRESHOLD IS GIVEN AS ARGUMENT
  134. for(j in 1:length(rN)) {
  135. refData2 <- refData[j] #consider only ref "j" as POI
  136. hpcond <- c(1) #Hp condition: ref i is contributor 1. This example only consider 1 reference profile. With x reference profiles this must be a x long vector.
  137. hdcond <- c(0) #Hd condition: ref i is not-contributor .This example only consider 1 reference profile. With x reference profiles this must be a x long vector.
  138. knownRefHd <- 1 #condition under Hd that ref i is a known non-contributors. This is a vector specifying which of the i-th references that are known non-contributors under hd.
  139. #plotEPG(Data=mixData,kitname=kit,threshT=threshT,refcond=refData2,showPH=TRUE) #plotting evidence with ref
  140. dat <- getData(mixData,refData2,popFreq) #process data for euroformix calculations (NOTICE THE CHANGE HERE OF NOT INCLUDING STUTTERS)
  141. nS = length(dat$samples) #number of samples
  142. #Perform calculatations
  143. set.seed(1)
  144. totAv <- sapply(dat$samples, function(x) sum(sapply(x,function(y) length(y$adata)))) #get number of alleles
  145. refData3 <- list()
  146. for(loc in names(dat$popFreq)) refData3[[loc]] <- lapply(refData2,function(x) x[[loc]]$adata) #get format for simDOdistr
  147. dropqq <- c(0.05,0.95) #quantiles to estimate
  148. diMd <- diMp <- rep(0,length(dropqq))#
  149. niter = 1e4 #required number of samples
  150. for(s in 1: nS) { #fix samples
  151. dihd <- simDOdistr(totA=totAv[s],nC=nC,popFreq,refData=NULL,minS=niter, prC=pC,M=2000) #consider only model under Hd
  152. dihp <- simDOdistr(totA=totAv[s],nC=nC,popFreq,refData=refData3,minS=niter, prC=pC,M=2000) #consider only model under Hd
  153. diMd <- quantile(dihd,dropqq)/nS
  154. diMp <- quantile(dihp,dropqq)/nS
  155. }
  156. div <- c(diMp,diMd)
  157. LRmc <- Vectorize(calcLR)(div)
  158. LR <- LRmc[which.min(LRmc)] #get conservative LR in LRmix
  159. # update out object
  160. out = rbind(out, c(testFile$evidence_name,rN[j],LR))
  161. # Export overall results
  162. write.table(out,file=outf,row.names=FALSE)
  163. }
  164. end=Sys.time() #end timer
  165. runtime=difftime(end,begin) #Calculate the total running time:
  166. paste("Time taken: ", sprintf("%.2fmin", runtime))