You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
355 lines
12 KiB
355 lines
12 KiB
#!/usr/bin/env Rscript
|
|
|
|
library("euroformix")
|
|
library("rjson")
|
|
|
|
# BEGIN: euroformix developer functions
|
|
# initialize helper functions
|
|
readFreq <- function(file) {
|
|
table <- read.table(file, header=TRUE, sep=",")
|
|
locs <- toupper(colnames(table[-1]))
|
|
popFreq <- list()
|
|
|
|
for (i in 1:length(locs)) {
|
|
freqs <- table[, i+1]
|
|
popFreq[[i]] <- table[!is.na(freqs), i+1]
|
|
names(popFreq[[i]]) <- table[!is.na(freqs), 1]
|
|
}
|
|
names(popFreq) <- locs
|
|
|
|
return(popFreq)
|
|
}
|
|
|
|
tableReader = function(filename) {
|
|
readTable = function(sep) {
|
|
read.table(filename, header=TRUE, sep=sep, stringsAsFactors=FALSE)
|
|
}
|
|
|
|
|
|
tab <- readTable("\t")
|
|
tryCatch({
|
|
if (ncol(tab) == 1) {
|
|
tab <- readTable(",")
|
|
}
|
|
}, error = function(e) {
|
|
e
|
|
})
|
|
|
|
tryCatch({
|
|
if (ncol(tab) == 1) {
|
|
tab <- readTable(";")
|
|
}
|
|
}, error = function(e) {
|
|
e
|
|
})
|
|
|
|
if (ncol(tab) == 1) {
|
|
tab <- readTable(";")
|
|
}
|
|
|
|
return(tab) # important: must return dataframe to keep allele-names correct!
|
|
}
|
|
|
|
sample_tableToList = function(X, threshT=NULL) {
|
|
cn = colnames(X) # colnames
|
|
|
|
lind = grep("marker", tolower(cn), fixed=TRUE) # locus col-ind
|
|
if (length(lind) == 0) {
|
|
lind = grep("loc", tolower(cn), fixed=TRUE) # try another name
|
|
}
|
|
sind = grep("sample",tolower(cn),fixed=TRUE) # sample col-ind
|
|
if (length(sind) > 1) {
|
|
sind = sind[grep("name", tolower(cn[sind]), fixed=TRUE)] # use only sample name
|
|
}
|
|
A_ind = grep("allele", tolower(cn), fixed=TRUE) # allele col-ind
|
|
H_ind = grep("height", tolower(cn), fixed=TRUE) # height col-ind
|
|
|
|
ln = unique(toupper(X[, lind])) # locus names: Convert to upper case
|
|
sn = unique(as.character(X[, sind])) # sample names
|
|
I = length(ln)
|
|
Y = list() # insert non-empty characters:
|
|
for (k in 1:length(sn)) { #for each sample in matrix
|
|
Y[[sn[k]]] = list() #one list for each sample
|
|
|
|
# for each locus
|
|
for (i in 1:I) {
|
|
xind = X[, sind] == sn[k] & toupper(X[, lind]) == ln[i] #get index in X for given sample and locus
|
|
|
|
if (sum(xind)==0) {
|
|
next
|
|
}
|
|
keep <- which(!is.na(X[xind, A_ind]) & X[xind, A_ind] != "")
|
|
|
|
# if peak heights are considered
|
|
if (length(H_ind) > 0) {
|
|
PH <- as.numeric(as.character(X[xind, H_ind][keep])) # get the peak heights
|
|
|
|
if (!is.null(threshT)) {
|
|
keep = which(PH >= threshT) # keep only alleles above thrshold (if given)
|
|
}
|
|
Y[[sn[k]]][[ln[i]]]$hdata = PH[keep]
|
|
}
|
|
|
|
if (length(A_ind) > 0) {
|
|
Y[[sn[k]]][[ln[i]]]$adata = as.character(X[xind, A_ind][keep])
|
|
}
|
|
}
|
|
}
|
|
names(Y) <- sn
|
|
|
|
return(Y)
|
|
}
|
|
|
|
# Helpfunction to get data to analyse
|
|
getData <- function(mixData2,tmpReference,popFreq) {
|
|
locs <- names(popFreq)
|
|
mixData <- lapply(mixData2, function(x) { return(x[locs]) }) # return selected loci
|
|
referenceData <- list()
|
|
for (loc in locs) {
|
|
referenceData[[loc]] <- lapply(tmpReference, function(x) { return(x[[loc]]$adata) }) # return selected loci
|
|
}
|
|
Qret <- Qassignate(samples=mixData, popFreq, referenceData, incS=FALSE, incR=FALSE) # NB: NOTICE THE CHANGE HERE OF inclS=FALSE even for stutter model (this has been updated in v2(
|
|
|
|
return(list(samples=mixData, referenceData=Qret$referenceData, popFreq=Qret$popFreq))
|
|
}
|
|
# END: euroformix developer functions
|
|
|
|
printf <- function(...) {
|
|
invisible(cat(sprintf(...)))
|
|
}
|
|
|
|
#' Read a JSON file containing environment variables for EuroForMix into a list.
|
|
#'
|
|
#' @param path character Path to JSON file containing EuroForMix's enviroment variables.
|
|
#'
|
|
#' @return list Unmarshalled environment variables.
|
|
#'
|
|
#' @examples
|
|
#'
|
|
#' loadEnv("path/to/settings.json")
|
|
#'
|
|
#' @export
|
|
loadEnv <- function(path) {
|
|
tryCatch({
|
|
env <- fromJSON(file=path)
|
|
}, error = function(e) {
|
|
# NOTE: trim whitespace on exception to have consistent output
|
|
printf("Error: Unable to read '%s': %s\n", path, trimws(e))
|
|
})
|
|
|
|
return(env)
|
|
}
|
|
|
|
printLabel <- function(label, var)
|
|
cat(paste0(" * ", label, ": ", var, '\n'))
|
|
|
|
#' Main-entry point to EuroForMix's multisample command-line interface.
|
|
#'
|
|
#' @param envFile character Path to a JSON file containing EuroForMix's environment variables.
|
|
#'
|
|
#' @return None
|
|
#'
|
|
#' @examples
|
|
#'
|
|
#' headless_efm('my_settings.json')
|
|
#'
|
|
#' @export
|
|
headless_efm <- function(evidenceFile, comparisonFile, settingsFile) {
|
|
# BEGIN: load input files into variables
|
|
|
|
settings = fromJSON(file=settingsFile)
|
|
|
|
# END: load input files into variables
|
|
|
|
# ----------------------------------------------------------------
|
|
|
|
# BEGIN: initialize environment
|
|
# set working directory where the rest of the paths can be relative
|
|
evidence <- fromJSON(file=evidenceFile)
|
|
comparison <- fromJSON(file=comparisonFile)
|
|
printf(" * Starting '%s' vs. '%s'\n", evidence$name, comparison$name)
|
|
|
|
databaseFile = settings$databaseFile # opt$database
|
|
popFreq <- readFreq(databaseFile) # import population freqs (allele frequency)
|
|
printLabel("DatabaseFile", databaseFile)
|
|
|
|
workingDir = settings$workingDir
|
|
printLabel("WorkingDir", workingDir)
|
|
setwd(workingDir)
|
|
caseTimeStart = Sys.time()
|
|
|
|
# get references
|
|
referenceFile <- settings$referenceFile # opt$ref - the file including references
|
|
referenceData = sample_tableToList(tableReader(referenceFile)) # load references
|
|
rN <- names(referenceData) # names of references
|
|
# END: initialize environment
|
|
|
|
# ----------------------------------------------------------------
|
|
|
|
# BEGIN: initialize model
|
|
nC = evidence$contributors # number of contributors
|
|
kit = settings$kit
|
|
threshT = settings$threshT #opt$threshold #25 #detection threshold (rfu)
|
|
fst = settings$fst
|
|
printLabel("Contributors", nC)
|
|
printLabel("Kit", kit)
|
|
printLabel("ThreshT", threshT)
|
|
printLabel("FST", fst)
|
|
|
|
# optimization and MCMC iterations
|
|
nDone = settings$nDone # number of required converged estimates
|
|
niter = settings$niter # number of iterations for LR sensitivity
|
|
printLabel("nDone", nDone)
|
|
printLabel("niter", niter)
|
|
|
|
stutter = settings$stutter # consider stutter in model?
|
|
xi = NULL # stutter parameter: Default is that it is estimated (NULL)
|
|
if (!stutter) {
|
|
xi = 0 # otherwise set as 0 => no stutter
|
|
}
|
|
printLabel("Sutter", stutter)
|
|
|
|
degrad = settings$degrad # consider degradation in model?
|
|
kit0 = NULL # default is that degradation is not estimated (NULL)
|
|
if (degrad) {
|
|
kit0 = kit # otherwise set as kit-short name
|
|
}
|
|
printLabel("Degrad", degrad)
|
|
|
|
dropin = settings$dropin # consider drop-in model?
|
|
pC <- lambda <- 0
|
|
if (dropin) {
|
|
pC <- 0.05 # dropin frequency.
|
|
lambda <- 0.01 # dropin peak height param
|
|
}
|
|
printLabel("Dropin", dropin)
|
|
|
|
# END: initialize model
|
|
# ----------------------------------------------------------------
|
|
# BEGIN: initialize input
|
|
|
|
|
|
# END: initialize input
|
|
# ----------------------------------------------------------------
|
|
# BEGIN: setup output
|
|
|
|
# output file to store results
|
|
setup <- paste0("T", threshT,
|
|
"_fst", fst,
|
|
"_pC", pC,
|
|
"_lam", lambda,
|
|
"_D", ifelse(degrad, 1,0),
|
|
"_S", ifelse(stutter, 1,0),
|
|
"_C", nC)
|
|
|
|
outf <- paste0("/home/stevie/Desktop/altar/.data/databases/my_vault/output/results_", setup, ".csv")
|
|
|
|
cn = c("EvidenceFile", "POI", "log10LR (ML)")
|
|
out = matrix(nrow=0, ncol=length(cn))
|
|
colnames(out) = cn
|
|
|
|
# END: setup output
|
|
# ----------------------------------------------------------------
|
|
# BEGIN: calculate LR
|
|
|
|
# DEBUG
|
|
evidenceFile <- "/home/stevie/Desktop/altar/resources/tutorialdata/stain.txt"
|
|
|
|
# get sample to analyse
|
|
# NOTE: notice that the peak height threshold is given as an argument
|
|
mixData = sample_tableToList(X=tableReader(evidenceFile), threshT=threshT)
|
|
for (j in 1:length(rN)) {
|
|
tmpReference <- referenceData[j] # consider only ref "i"
|
|
|
|
# hp condition: ref i is contributor 1. this example only considers 1
|
|
# reference profile. With x reference profiles this must be a x long vector
|
|
hpCond <- c(1)
|
|
|
|
# hd condition: ref i is not-contributor. this example only considers 1
|
|
# reference profile. with x reference profiles this must be a x long vector
|
|
hdCond <- c(0)
|
|
|
|
# condition under Hd that ref i is a known non-contributors. this is a vector
|
|
# specifying which of the i-th references that are known non-contributors under hd
|
|
knownRefHd <- 1
|
|
|
|
# plotEPG(Data=mixData, kitname=kit, threshT=threshT, refcond=tmpReference, showPH=TRUE) # plotting evidence with ref
|
|
|
|
# process data for euroformix calculations
|
|
# NOTE: notice the change here of not including stutters
|
|
dat <- getData(mixData, tmpReference, popFreq)
|
|
|
|
# perform calculatations
|
|
# calculate lr (mle based) using continuous model
|
|
set.seed(1)
|
|
hpfit <- contLikMLE(nC,
|
|
dat$samples,
|
|
popFreq=dat$popFreq,
|
|
threshT=threshT,
|
|
nDone=nDone,
|
|
xi=xi,
|
|
refData=dat$refData,
|
|
prC=pC,
|
|
lambda=lambda,
|
|
fst=fst,
|
|
condOrder=hpCond,
|
|
kit=kit0)
|
|
# print(hpfit$fit$thetahat2) # estimated parameters
|
|
# validMLEmodel(hpfit, kit=kit, plottitle="Hp") # valid model assumption under hp
|
|
|
|
set.seed(1)
|
|
hdfit <- contLikMLE(nC,
|
|
dat$samples,
|
|
popFreq=dat$popFreq,
|
|
threshT=threshT,
|
|
nDone=nDone,
|
|
xi=xi,
|
|
refData=dat$refData,
|
|
prC=pC,
|
|
lambda=lambda,
|
|
fst=fst,
|
|
condOrder=hdCond,
|
|
knownRef=knownRefHd,
|
|
kit=kit0)
|
|
# print(hdfit$fit$thetahat2) #estimated parameters
|
|
# validMLEmodel(hdfit, kit=kit, plottitle="Hd") # valid model assumption under hd
|
|
|
|
# WoE:
|
|
LRmle <- exp(hpfit$fit$loglik - hdfit$fit$loglik)
|
|
log10LRmle <- (hpfit$fit$loglik - hdfit$fit$loglik)/log(10)
|
|
|
|
# calculate CONS LR based on MCMC (5% of LR-sensitivity)
|
|
if (settings$conservativeLR) { # if you want you can allow this section to calculate your "conservative" LR estimate
|
|
qq0 <- 0.05 # quantile used as conservative estimate
|
|
delta = 10 # variance of the MCMC proposal distribution
|
|
set.seed(1)
|
|
mcmchp <- contLikMCMC(hpfit, niter=niter, delta=delta) # under hp
|
|
|
|
# validMCMC (mcmchp)
|
|
set.seed(1)
|
|
mcmchd <- contLikMCMC(hdfit, niter=niter, delta=delta) # under hd
|
|
|
|
# validMCMC (mcmchd)
|
|
log10LRmleCons <- quantile((mcmchp$postlogL-mcmchd$postlogL)/log(10), qq0)
|
|
}
|
|
|
|
# update out object
|
|
out = rbind(out, c(evidence$name, rN[j], log10LRmle)) #notice on log10 scale = Bins
|
|
|
|
# export overall results
|
|
write.table(out, file=outf, row.names=FALSE)
|
|
# end of calculation loop
|
|
} # end of evidence loop
|
|
|
|
caseTimeEnd = Sys.time()
|
|
caseRuntime = difftime(caseTimeEnd, caseTimeStart)
|
|
|
|
# TODO: generate case name
|
|
printf(" * Finished '%s' vs. '%s' (%.2f s)\n", evidence$name, comparison$name, caseRuntime)
|
|
|
|
# END: calculate lr
|
|
# ----------------------------------------------------------------
|
|
}
|
|
|
|
args = commandArgs(trailingOnly=TRUE)
|
|
headless_efm(args[1], args[2], "/home/stevie/Desktop/altar/resources/settings.json")
|