You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

355 lines
12 KiB

#!/usr/bin/env Rscript
library("euroformix")
library("rjson")
# BEGIN: euroformix developer functions
# initialize helper functions
readFreq <- function(file) {
table <- read.table(file, header=TRUE, sep=",")
locs <- toupper(colnames(table[-1]))
popFreq <- list()
for (i in 1:length(locs)) {
freqs <- table[, i+1]
popFreq[[i]] <- table[!is.na(freqs), i+1]
names(popFreq[[i]]) <- table[!is.na(freqs), 1]
}
names(popFreq) <- locs
return(popFreq)
}
tableReader = function(filename) {
readTable = function(sep) {
read.table(filename, header=TRUE, sep=sep, stringsAsFactors=FALSE)
}
tab <- readTable("\t")
tryCatch({
if (ncol(tab) == 1) {
tab <- readTable(",")
}
}, error = function(e) {
e
})
tryCatch({
if (ncol(tab) == 1) {
tab <- readTable(";")
}
}, error = function(e) {
e
})
if (ncol(tab) == 1) {
tab <- readTable(";")
}
return(tab) # important: must return dataframe to keep allele-names correct!
}
sample_tableToList = function(X, threshT=NULL) {
cn = colnames(X) # colnames
lind = grep("marker", tolower(cn), fixed=TRUE) # locus col-ind
if (length(lind) == 0) {
lind = grep("loc", tolower(cn), fixed=TRUE) # try another name
}
sind = grep("sample",tolower(cn),fixed=TRUE) # sample col-ind
if (length(sind) > 1) {
sind = sind[grep("name", tolower(cn[sind]), fixed=TRUE)] # use only sample name
}
A_ind = grep("allele", tolower(cn), fixed=TRUE) # allele col-ind
H_ind = grep("height", tolower(cn), fixed=TRUE) # height col-ind
ln = unique(toupper(X[, lind])) # locus names: Convert to upper case
sn = unique(as.character(X[, sind])) # sample names
I = length(ln)
Y = list() # insert non-empty characters:
for (k in 1:length(sn)) { #for each sample in matrix
Y[[sn[k]]] = list() #one list for each sample
# for each locus
for (i in 1:I) {
xind = X[, sind] == sn[k] & toupper(X[, lind]) == ln[i] #get index in X for given sample and locus
if (sum(xind)==0) {
next
}
keep <- which(!is.na(X[xind, A_ind]) & X[xind, A_ind] != "")
# if peak heights are considered
if (length(H_ind) > 0) {
PH <- as.numeric(as.character(X[xind, H_ind][keep])) # get the peak heights
if (!is.null(threshT)) {
keep = which(PH >= threshT) # keep only alleles above thrshold (if given)
}
Y[[sn[k]]][[ln[i]]]$hdata = PH[keep]
}
if (length(A_ind) > 0) {
Y[[sn[k]]][[ln[i]]]$adata = as.character(X[xind, A_ind][keep])
}
}
}
names(Y) <- sn
return(Y)
}
# Helpfunction to get data to analyse
getData <- function(mixData2,tmpReference,popFreq) {
locs <- names(popFreq)
mixData <- lapply(mixData2, function(x) { return(x[locs]) }) # return selected loci
referenceData <- list()
for (loc in locs) {
referenceData[[loc]] <- lapply(tmpReference, function(x) { return(x[[loc]]$adata) }) # return selected loci
}
Qret <- Qassignate(samples=mixData, popFreq, referenceData, incS=FALSE, incR=FALSE) # NB: NOTICE THE CHANGE HERE OF inclS=FALSE even for stutter model (this has been updated in v2(
return(list(samples=mixData, referenceData=Qret$referenceData, popFreq=Qret$popFreq))
}
# END: euroformix developer functions
printf <- function(...) {
invisible(cat(sprintf(...)))
}
#' Read a JSON file containing environment variables for EuroForMix into a list.
#'
#' @param path character Path to JSON file containing EuroForMix's enviroment variables.
#'
#' @return list Unmarshalled environment variables.
#'
#' @examples
#'
#' loadEnv("path/to/settings.json")
#'
#' @export
loadEnv <- function(path) {
tryCatch({
env <- fromJSON(file=path)
}, error = function(e) {
# NOTE: trim whitespace on exception to have consistent output
printf("Error: Unable to read '%s': %s\n", path, trimws(e))
})
return(env)
}
printLabel <- function(label, var)
cat(paste0(" * ", label, ": ", var, '\n'))
#' Main-entry point to EuroForMix's multisample command-line interface.
#'
#' @param envFile character Path to a JSON file containing EuroForMix's environment variables.
#'
#' @return None
#'
#' @examples
#'
#' headless_efm('my_settings.json')
#'
#' @export
headless_efm <- function(evidenceFile, comparisonFile, settingsFile) {
# BEGIN: load input files into variables
settings = fromJSON(file=settingsFile)
# END: load input files into variables
# ----------------------------------------------------------------
# BEGIN: initialize environment
# set working directory where the rest of the paths can be relative
evidence <- fromJSON(file=evidenceFile)
comparison <- fromJSON(file=comparisonFile)
printf(" * Starting '%s' vs. '%s'\n", evidence$name, comparison$name)
databaseFile = settings$databaseFile # opt$database
popFreq <- readFreq(databaseFile) # import population freqs (allele frequency)
printLabel("DatabaseFile", databaseFile)
workingDir = settings$workingDir
printLabel("WorkingDir", workingDir)
setwd(workingDir)
caseTimeStart = Sys.time()
# get references
referenceFile <- settings$referenceFile # opt$ref - the file including references
referenceData = sample_tableToList(tableReader(referenceFile)) # load references
rN <- names(referenceData) # names of references
# END: initialize environment
# ----------------------------------------------------------------
# BEGIN: initialize model
nC = evidence$contributors # number of contributors
kit = settings$kit
threshT = settings$threshT #opt$threshold #25 #detection threshold (rfu)
fst = settings$fst
printLabel("Contributors", nC)
printLabel("Kit", kit)
printLabel("ThreshT", threshT)
printLabel("FST", fst)
# optimization and MCMC iterations
nDone = settings$nDone # number of required converged estimates
niter = settings$niter # number of iterations for LR sensitivity
printLabel("nDone", nDone)
printLabel("niter", niter)
stutter = settings$stutter # consider stutter in model?
xi = NULL # stutter parameter: Default is that it is estimated (NULL)
if (!stutter) {
xi = 0 # otherwise set as 0 => no stutter
}
printLabel("Sutter", stutter)
degrad = settings$degrad # consider degradation in model?
kit0 = NULL # default is that degradation is not estimated (NULL)
if (degrad) {
kit0 = kit # otherwise set as kit-short name
}
printLabel("Degrad", degrad)
dropin = settings$dropin # consider drop-in model?
pC <- lambda <- 0
if (dropin) {
pC <- 0.05 # dropin frequency.
lambda <- 0.01 # dropin peak height param
}
printLabel("Dropin", dropin)
# END: initialize model
# ----------------------------------------------------------------
# BEGIN: initialize input
# END: initialize input
# ----------------------------------------------------------------
# BEGIN: setup output
# output file to store results
setup <- paste0("T", threshT,
"_fst", fst,
"_pC", pC,
"_lam", lambda,
"_D", ifelse(degrad, 1,0),
"_S", ifelse(stutter, 1,0),
"_C", nC)
outf <- paste0("/home/stevie/Desktop/altar/.data/databases/my_vault/output/results_", setup, ".csv")
cn = c("EvidenceFile", "POI", "log10LR (ML)")
out = matrix(nrow=0, ncol=length(cn))
colnames(out) = cn
# END: setup output
# ----------------------------------------------------------------
# BEGIN: calculate LR
# DEBUG
evidenceFile <- "/home/stevie/Desktop/altar/resources/tutorialdata/stain.txt"
# get sample to analyse
# NOTE: notice that the peak height threshold is given as an argument
mixData = sample_tableToList(X=tableReader(evidenceFile), threshT=threshT)
for (j in 1:length(rN)) {
tmpReference <- referenceData[j] # consider only ref "i"
# hp condition: ref i is contributor 1. this example only considers 1
# reference profile. With x reference profiles this must be a x long vector
hpCond <- c(1)
# hd condition: ref i is not-contributor. this example only considers 1
# reference profile. with x reference profiles this must be a x long vector
hdCond <- c(0)
# condition under Hd that ref i is a known non-contributors. this is a vector
# specifying which of the i-th references that are known non-contributors under hd
knownRefHd <- 1
# plotEPG(Data=mixData, kitname=kit, threshT=threshT, refcond=tmpReference, showPH=TRUE) # plotting evidence with ref
# process data for euroformix calculations
# NOTE: notice the change here of not including stutters
dat <- getData(mixData, tmpReference, popFreq)
# perform calculatations
# calculate lr (mle based) using continuous model
set.seed(1)
hpfit <- contLikMLE(nC,
dat$samples,
popFreq=dat$popFreq,
threshT=threshT,
nDone=nDone,
xi=xi,
refData=dat$refData,
prC=pC,
lambda=lambda,
fst=fst,
condOrder=hpCond,
kit=kit0)
# print(hpfit$fit$thetahat2) # estimated parameters
# validMLEmodel(hpfit, kit=kit, plottitle="Hp") # valid model assumption under hp
set.seed(1)
hdfit <- contLikMLE(nC,
dat$samples,
popFreq=dat$popFreq,
threshT=threshT,
nDone=nDone,
xi=xi,
refData=dat$refData,
prC=pC,
lambda=lambda,
fst=fst,
condOrder=hdCond,
knownRef=knownRefHd,
kit=kit0)
# print(hdfit$fit$thetahat2) #estimated parameters
# validMLEmodel(hdfit, kit=kit, plottitle="Hd") # valid model assumption under hd
# WoE:
LRmle <- exp(hpfit$fit$loglik - hdfit$fit$loglik)
log10LRmle <- (hpfit$fit$loglik - hdfit$fit$loglik)/log(10)
# calculate CONS LR based on MCMC (5% of LR-sensitivity)
if (settings$conservativeLR) { # if you want you can allow this section to calculate your "conservative" LR estimate
qq0 <- 0.05 # quantile used as conservative estimate
delta = 10 # variance of the MCMC proposal distribution
set.seed(1)
mcmchp <- contLikMCMC(hpfit, niter=niter, delta=delta) # under hp
# validMCMC (mcmchp)
set.seed(1)
mcmchd <- contLikMCMC(hdfit, niter=niter, delta=delta) # under hd
# validMCMC (mcmchd)
log10LRmleCons <- quantile((mcmchp$postlogL-mcmchd$postlogL)/log(10), qq0)
}
# update out object
out = rbind(out, c(evidence$name, rN[j], log10LRmle)) #notice on log10 scale = Bins
# export overall results
write.table(out, file=outf, row.names=FALSE)
# end of calculation loop
} # end of evidence loop
caseTimeEnd = Sys.time()
caseRuntime = difftime(caseTimeEnd, caseTimeStart)
# TODO: generate case name
printf(" * Finished '%s' vs. '%s' (%.2f s)\n", evidence$name, comparison$name, caseRuntime)
# END: calculate lr
# ----------------------------------------------------------------
}
args = commandArgs(trailingOnly=TRUE)
headless_efm(args[1], args[2], "/home/stevie/Desktop/altar/resources/settings.json")