You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

355 lines
12 KiB

3 years ago
  1. #!/usr/bin/env Rscript
  2. library("euroformix")
  3. library("rjson")
  4. # BEGIN: euroformix developer functions
  5. # initialize helper functions
  6. readFreq <- function(file) {
  7. table <- read.table(file, header=TRUE, sep=",")
  8. locs <- toupper(colnames(table[-1]))
  9. popFreq <- list()
  10. for (i in 1:length(locs)) {
  11. freqs <- table[, i+1]
  12. popFreq[[i]] <- table[!is.na(freqs), i+1]
  13. names(popFreq[[i]]) <- table[!is.na(freqs), 1]
  14. }
  15. names(popFreq) <- locs
  16. return(popFreq)
  17. }
  18. tableReader = function(filename) {
  19. readTable = function(sep) {
  20. read.table(filename, header=TRUE, sep=sep, stringsAsFactors=FALSE)
  21. }
  22. tab <- readTable("\t")
  23. tryCatch({
  24. if (ncol(tab) == 1) {
  25. tab <- readTable(",")
  26. }
  27. }, error = function(e) {
  28. e
  29. })
  30. tryCatch({
  31. if (ncol(tab) == 1) {
  32. tab <- readTable(";")
  33. }
  34. }, error = function(e) {
  35. e
  36. })
  37. if (ncol(tab) == 1) {
  38. tab <- readTable(";")
  39. }
  40. return(tab) # important: must return dataframe to keep allele-names correct!
  41. }
  42. sample_tableToList = function(X, threshT=NULL) {
  43. cn = colnames(X) # colnames
  44. lind = grep("marker", tolower(cn), fixed=TRUE) # locus col-ind
  45. if (length(lind) == 0) {
  46. lind = grep("loc", tolower(cn), fixed=TRUE) # try another name
  47. }
  48. sind = grep("sample",tolower(cn),fixed=TRUE) # sample col-ind
  49. if (length(sind) > 1) {
  50. sind = sind[grep("name", tolower(cn[sind]), fixed=TRUE)] # use only sample name
  51. }
  52. A_ind = grep("allele", tolower(cn), fixed=TRUE) # allele col-ind
  53. H_ind = grep("height", tolower(cn), fixed=TRUE) # height col-ind
  54. ln = unique(toupper(X[, lind])) # locus names: Convert to upper case
  55. sn = unique(as.character(X[, sind])) # sample names
  56. I = length(ln)
  57. Y = list() # insert non-empty characters:
  58. for (k in 1:length(sn)) { #for each sample in matrix
  59. Y[[sn[k]]] = list() #one list for each sample
  60. # for each locus
  61. for (i in 1:I) {
  62. xind = X[, sind] == sn[k] & toupper(X[, lind]) == ln[i] #get index in X for given sample and locus
  63. if (sum(xind)==0) {
  64. next
  65. }
  66. keep <- which(!is.na(X[xind, A_ind]) & X[xind, A_ind] != "")
  67. # if peak heights are considered
  68. if (length(H_ind) > 0) {
  69. PH <- as.numeric(as.character(X[xind, H_ind][keep])) # get the peak heights
  70. if (!is.null(threshT)) {
  71. keep = which(PH >= threshT) # keep only alleles above thrshold (if given)
  72. }
  73. Y[[sn[k]]][[ln[i]]]$hdata = PH[keep]
  74. }
  75. if (length(A_ind) > 0) {
  76. Y[[sn[k]]][[ln[i]]]$adata = as.character(X[xind, A_ind][keep])
  77. }
  78. }
  79. }
  80. names(Y) <- sn
  81. return(Y)
  82. }
  83. # Helpfunction to get data to analyse
  84. getData <- function(mixData2,tmpReference,popFreq) {
  85. locs <- names(popFreq)
  86. mixData <- lapply(mixData2, function(x) { return(x[locs]) }) # return selected loci
  87. referenceData <- list()
  88. for (loc in locs) {
  89. referenceData[[loc]] <- lapply(tmpReference, function(x) { return(x[[loc]]$adata) }) # return selected loci
  90. }
  91. Qret <- Qassignate(samples=mixData, popFreq, referenceData, incS=FALSE, incR=FALSE) # NB: NOTICE THE CHANGE HERE OF inclS=FALSE even for stutter model (this has been updated in v2(
  92. return(list(samples=mixData, referenceData=Qret$referenceData, popFreq=Qret$popFreq))
  93. }
  94. # END: euroformix developer functions
  95. printf <- function(...) {
  96. invisible(cat(sprintf(...)))
  97. }
  98. #' Read a JSON file containing environment variables for EuroForMix into a list.
  99. #'
  100. #' @param path character Path to JSON file containing EuroForMix's enviroment variables.
  101. #'
  102. #' @return list Unmarshalled environment variables.
  103. #'
  104. #' @examples
  105. #'
  106. #' loadEnv("path/to/settings.json")
  107. #'
  108. #' @export
  109. loadEnv <- function(path) {
  110. tryCatch({
  111. env <- fromJSON(file=path)
  112. }, error = function(e) {
  113. # NOTE: trim whitespace on exception to have consistent output
  114. printf("Error: Unable to read '%s': %s\n", path, trimws(e))
  115. })
  116. return(env)
  117. }
  118. printLabel <- function(label, var)
  119. cat(paste0(" * ", label, ": ", var, '\n'))
  120. #' Main-entry point to EuroForMix's multisample command-line interface.
  121. #'
  122. #' @param envFile character Path to a JSON file containing EuroForMix's environment variables.
  123. #'
  124. #' @return None
  125. #'
  126. #' @examples
  127. #'
  128. #' headless_efm('my_settings.json')
  129. #'
  130. #' @export
  131. headless_efm <- function(evidenceFile, comparisonFile, settingsFile) {
  132. # BEGIN: load input files into variables
  133. settings = fromJSON(file=settingsFile)
  134. # END: load input files into variables
  135. # ----------------------------------------------------------------
  136. # BEGIN: initialize environment
  137. # set working directory where the rest of the paths can be relative
  138. evidence <- fromJSON(file=evidenceFile)
  139. comparison <- fromJSON(file=comparisonFile)
  140. printf(" * Starting '%s' vs. '%s'\n", evidence$name, comparison$name)
  141. databaseFile = settings$databaseFile # opt$database
  142. popFreq <- readFreq(databaseFile) # import population freqs (allele frequency)
  143. printLabel("DatabaseFile", databaseFile)
  144. workingDir = settings$workingDir
  145. printLabel("WorkingDir", workingDir)
  146. setwd(workingDir)
  147. caseTimeStart = Sys.time()
  148. # get references
  149. referenceFile <- settings$referenceFile # opt$ref - the file including references
  150. referenceData = sample_tableToList(tableReader(referenceFile)) # load references
  151. rN <- names(referenceData) # names of references
  152. # END: initialize environment
  153. # ----------------------------------------------------------------
  154. # BEGIN: initialize model
  155. nC = evidence$contributors # number of contributors
  156. kit = settings$kit
  157. threshT = settings$threshT #opt$threshold #25 #detection threshold (rfu)
  158. fst = settings$fst
  159. printLabel("Contributors", nC)
  160. printLabel("Kit", kit)
  161. printLabel("ThreshT", threshT)
  162. printLabel("FST", fst)
  163. # optimization and MCMC iterations
  164. nDone = settings$nDone # number of required converged estimates
  165. niter = settings$niter # number of iterations for LR sensitivity
  166. printLabel("nDone", nDone)
  167. printLabel("niter", niter)
  168. stutter = settings$stutter # consider stutter in model?
  169. xi = NULL # stutter parameter: Default is that it is estimated (NULL)
  170. if (!stutter) {
  171. xi = 0 # otherwise set as 0 => no stutter
  172. }
  173. printLabel("Sutter", stutter)
  174. degrad = settings$degrad # consider degradation in model?
  175. kit0 = NULL # default is that degradation is not estimated (NULL)
  176. if (degrad) {
  177. kit0 = kit # otherwise set as kit-short name
  178. }
  179. printLabel("Degrad", degrad)
  180. dropin = settings$dropin # consider drop-in model?
  181. pC <- lambda <- 0
  182. if (dropin) {
  183. pC <- 0.05 # dropin frequency.
  184. lambda <- 0.01 # dropin peak height param
  185. }
  186. printLabel("Dropin", dropin)
  187. # END: initialize model
  188. # ----------------------------------------------------------------
  189. # BEGIN: initialize input
  190. # END: initialize input
  191. # ----------------------------------------------------------------
  192. # BEGIN: setup output
  193. # output file to store results
  194. setup <- paste0("T", threshT,
  195. "_fst", fst,
  196. "_pC", pC,
  197. "_lam", lambda,
  198. "_D", ifelse(degrad, 1,0),
  199. "_S", ifelse(stutter, 1,0),
  200. "_C", nC)
  201. outf <- paste0("/home/stevie/Desktop/altar/.data/databases/my_vault/output/results_", setup, ".csv")
  202. cn = c("EvidenceFile", "POI", "log10LR (ML)")
  203. out = matrix(nrow=0, ncol=length(cn))
  204. colnames(out) = cn
  205. # END: setup output
  206. # ----------------------------------------------------------------
  207. # BEGIN: calculate LR
  208. # DEBUG
  209. evidenceFile <- "/home/stevie/Desktop/altar/resources/tutorialdata/stain.txt"
  210. # get sample to analyse
  211. # NOTE: notice that the peak height threshold is given as an argument
  212. mixData = sample_tableToList(X=tableReader(evidenceFile), threshT=threshT)
  213. for (j in 1:length(rN)) {
  214. tmpReference <- referenceData[j] # consider only ref "i"
  215. # hp condition: ref i is contributor 1. this example only considers 1
  216. # reference profile. With x reference profiles this must be a x long vector
  217. hpCond <- c(1)
  218. # hd condition: ref i is not-contributor. this example only considers 1
  219. # reference profile. with x reference profiles this must be a x long vector
  220. hdCond <- c(0)
  221. # condition under Hd that ref i is a known non-contributors. this is a vector
  222. # specifying which of the i-th references that are known non-contributors under hd
  223. knownRefHd <- 1
  224. # plotEPG(Data=mixData, kitname=kit, threshT=threshT, refcond=tmpReference, showPH=TRUE) # plotting evidence with ref
  225. # process data for euroformix calculations
  226. # NOTE: notice the change here of not including stutters
  227. dat <- getData(mixData, tmpReference, popFreq)
  228. # perform calculatations
  229. # calculate lr (mle based) using continuous model
  230. set.seed(1)
  231. hpfit <- contLikMLE(nC,
  232. dat$samples,
  233. popFreq=dat$popFreq,
  234. threshT=threshT,
  235. nDone=nDone,
  236. xi=xi,
  237. refData=dat$refData,
  238. prC=pC,
  239. lambda=lambda,
  240. fst=fst,
  241. condOrder=hpCond,
  242. kit=kit0)
  243. # print(hpfit$fit$thetahat2) # estimated parameters
  244. # validMLEmodel(hpfit, kit=kit, plottitle="Hp") # valid model assumption under hp
  245. set.seed(1)
  246. hdfit <- contLikMLE(nC,
  247. dat$samples,
  248. popFreq=dat$popFreq,
  249. threshT=threshT,
  250. nDone=nDone,
  251. xi=xi,
  252. refData=dat$refData,
  253. prC=pC,
  254. lambda=lambda,
  255. fst=fst,
  256. condOrder=hdCond,
  257. knownRef=knownRefHd,
  258. kit=kit0)
  259. # print(hdfit$fit$thetahat2) #estimated parameters
  260. # validMLEmodel(hdfit, kit=kit, plottitle="Hd") # valid model assumption under hd
  261. # WoE:
  262. LRmle <- exp(hpfit$fit$loglik - hdfit$fit$loglik)
  263. log10LRmle <- (hpfit$fit$loglik - hdfit$fit$loglik)/log(10)
  264. # calculate CONS LR based on MCMC (5% of LR-sensitivity)
  265. if (settings$conservativeLR) { # if you want you can allow this section to calculate your "conservative" LR estimate
  266. qq0 <- 0.05 # quantile used as conservative estimate
  267. delta = 10 # variance of the MCMC proposal distribution
  268. set.seed(1)
  269. mcmchp <- contLikMCMC(hpfit, niter=niter, delta=delta) # under hp
  270. # validMCMC (mcmchp)
  271. set.seed(1)
  272. mcmchd <- contLikMCMC(hdfit, niter=niter, delta=delta) # under hd
  273. # validMCMC (mcmchd)
  274. log10LRmleCons <- quantile((mcmchp$postlogL-mcmchd$postlogL)/log(10), qq0)
  275. }
  276. # update out object
  277. out = rbind(out, c(evidence$name, rN[j], log10LRmle)) #notice on log10 scale = Bins
  278. # export overall results
  279. write.table(out, file=outf, row.names=FALSE)
  280. # end of calculation loop
  281. } # end of evidence loop
  282. caseTimeEnd = Sys.time()
  283. caseRuntime = difftime(caseTimeEnd, caseTimeStart)
  284. # TODO: generate case name
  285. printf(" * Finished '%s' vs. '%s' (%.2f s)\n", evidence$name, comparison$name, caseRuntime)
  286. # END: calculate lr
  287. # ----------------------------------------------------------------
  288. }
  289. args = commandArgs(trailingOnly=TRUE)
  290. headless_efm(args[1], args[2], "/home/stevie/Desktop/altar/resources/settings.json")