# Copyright (c) 2014,
# Mathias Kuhring, KuhringM@rki.de, Robert Koch Institute, Germany, 
# All rights reserved. For details, please note the license.txt.

# imports the tmp features from the java routine
importFeatures <- function(assembly.files, quality.files, 
                           features.files, coverage.files, ccoverage.files){
  
  rawdata <- vector('list', length(features.files))
  if (DEBUGGING){ print("feature import") }
  for (i in 1:length(features.files)){
    if (DEBUGGING){ print(as.character(features.files[i])) }
    
    rawdata[[i]] <- cbind(Assembly=assembly.files[i],
                          ReadQuality=quality.files[i],
                          read.table(features.files[i], header=TRUE, 
                                     colClasses=get.colClasses(features.files[i])))
    rawdata[[i]]$Coverage <- read.coverage(coverage.files[i])
    rawdata[[i]]$CoreCoverage <- read.coverage(ccoverage.files[i])
  }
  if (DEBUGGING){ print("import done") }
  return(rawdata)
}


# Attach an expected genome size from egs to every data.frame in feature.list
attachEGS <- function(feature.list, egs){
  if (length(egs)==1){
    egs <- rep(egs, length(feature.list))
  }
  for (i in 1:length(feature.list)){
    attr(feature.list[[i]], "egs") <- egs[i]
  }
  return(feature.list)
}


# Extract further features, check if single or multi core should be used
extractFeatures <- function(rawdata, cores=1){
  if (DEBUGGING){ cat("feature extraction... ") }
  
  if (cores>1) results <- mcFeatures(rawdata, cores)
  else      results <- scFeatures(rawdata)
  
  if (DEBUGGING){ cat("done") }
  return(results)
}


# Singlecore feature extraction
scFeatures <- function(rawdata){
  results <- lapply(rawdata, function(x){ return(features(x)) })
  return(results)
}


# Multicore feature extraction
mcFeatures <- function(rawdata, cores){
#   require("parallel")
  results <- mclapply(rawdata, function(x){ return(features(x)) }, mc.cores=cores)
  return(results)
}


# Step by step feature extraction
features <- function(result){

  # Convert ContigID from factor to characters
  result$ContigID <- as.character(result$ContigID)  
  
  # GenomeSize & GenomeRelation
  egs = attr(result, "egs", exact=TRUE)
  if (DEBUGGING) print(paste("egs:", egs))
  if (egs == 0){
    result <- insert(result, sum(result$Length), 'EstimatedGenomeSize', 'ReadCount')    
  }
  else{
    result <- insert(result, egs, 'EstimatedGenomeSize', 'ReadCount')  
  }
  result <- insert(result, result$Length/result$EstimatedGenomeSize, 'GenomeRelation', 'EstimatedGenomeSize')
  
  # N50 & N50Relation
  result <- insert(result, N50(result$Length), 'N50', 'GenomeRelation')
  result <- insert(result, result$Length/result$N50, 'N50Relation', 'N50')
  
  # ReadLengthQuotients
  result <- insert(result, result$ReadLengthsMean/result$ReadLengthsPaddedMean, 'ReadLengthsQuotientMean', 'ReadLengthsPaddedMax')
  result <- insert(result, result$ReadLengthsMedian/result$ReadLengthsPaddedMedian, 'ReadLengthsQuotientMedian', 'ReadLengthsQuotientMean')
  result <- insert(result, result$ReadLengthsMin/result$ReadLengthsPaddedMin, 'ReadLengthsQuotientMin', 'ReadLengthsQuotientMedian')
  result <- insert(result, result$ReadLengthsMax/result$ReadLengthsPaddedMax, 'ReadLengthsQuotientMax', 'ReadLengthsQuotientMin')
  
  # ReadLengthQuotions Old: had to remove ReadLengthsQuotientSD and ReadLengthsQuotientMAD
  # They can become NAs if the ReadLengthsPaddesSD/MAD are all 0 (division by zero -> NA). 
  # This occures only if the PaddedReadLength of all reads of a contig is exactly the same (i.e. no gaps at all),
  # which is extremely unlikely in a real assembly. However, we observed data were this occures.
#   result <- insert(result, result$ReadLengthsMean/result$ReadLengthsPaddedMean, 'ReadLengthsQuotientMean', 'ReadLengthsPaddedMax')
#   result <- insert(result, result$ReadLengthsSD/result$ReadLengthsPaddedSD, 'ReadLengthsQuotientSD', 'ReadLengthsQuotientMean')
#   result <- insert(result, result$ReadLengthsMedian/result$ReadLengthsPaddedMedian, 'ReadLengthsQuotientMedian', 'ReadLengthsQuotientSD')
#   result <- insert(result, result$ReadLengthsMAD/result$ReadLengthsPaddedMAD, 'ReadLengthsQuotientMAD', 'ReadLengthsQuotientMedian')
#   result <- insert(result, result$ReadLengthsMin/result$ReadLengthsPaddedMin, 'ReadLengthsQuotientMin', 'ReadLengthsQuotientMAD')
#   result <- insert(result, result$ReadLengthsMax/result$ReadLengthsPaddedMax, 'ReadLengthsQuotientMax', 'ReadLengthsQuotientMin')
  
  # CoverageComparison
  result <- insert(result, result$CoverageGlobalMean/mean(result$CoverageGlobalMean),
                   'CoverageGlobalMeanComparison', 'CoverageMaxEndMax')
  result <- insert(result, result$CoverageMinEndMean/mean(result$CoverageMinEndMean),
                   'CoverageMinEndMeanComparison', 'CoverageGlobalMeanComparison')
  result <- insert(result, result$CoverageMaxEndMean/mean(result$CoverageMaxEndMean),
                   'CoverageMaxEndMeanComparison', 'CoverageMinEndMeanComparison')
  
  # BaseConfirmation
  conf <- apply(cbind(result$CoreCoverage, result$Coverage), 1, multibinom)
  result <- insert(result, sapply(conf, mean), 'BaseConfirmationGlobalMean', 'CoreCoverageMaxEndMax')
  result <- insert(result, sapply(conf, sd), 'BaseConfirmationGlobalSD', 'BaseConfirmationGlobalMean')
  result <- insert(result, sapply(conf, median), 'BaseConfirmationGlobalMedian', 'BaseConfirmationGlobalSD')
  result <- insert(result, sapply(conf, mad), 'BaseConfirmationGlobalMAD', 'BaseConfirmationGlobalMedian')
  result <- insert(result, sapply(conf, min), 'BaseConfirmationGlobalMin', 'BaseConfirmationGlobalMAD')
  result <- insert(result, sapply(conf, max), 'BaseConfirmationGlobalMax', 'BaseConfirmationGlobalMin')
  
  pos <- which(names(result)=='BaseConfirmationGlobalMax')
  endconf <- endConfirmation(result$Coverage, result$CoreCoverage, result$ReadLengthsMean)
  result <- cbind(result[1:pos], endconf, result[(pos+1):length(result)])
  
  # CoverageDrop
  result <- cbind(result, calcFeatureCoverageDrops(result$Coverage, result$ReadLengthsMean))
  
  # remove Coverega and CoreCoverage
  result$Coverage <- NULL
  result$CoreCoverage <- NULL
  
  return(result)
}


export <- function(results){
  for (i in 1:length(results)){
    filename <- paste("output", as.character(i), ".csv", sep="")
    write.table(results[[i]],file=filename,sep=",",row.names=F)
  }
}

featureReduction <- function(){
  featCor <- cor(training[,3:(length(training)-1)],training[,3:(length(training)-1)])
  index <- is.na(featCor)
  featCor2 <- replace(featCor, index, 0)
  #   corrgram(featCor2, type='cor')
  
  i=1
  while (i<=dim(featCor2)[1]){
    idx <- abs(featCor2[i,])<0.20
    idx[i] <- TRUE
    featCor2 <- featCor2[idx,idx]
    i <- i+1
  }
  
  featNames <- rownames(featCor2)
  featNames <- c(names(training)[1:2], featNames, names(training)[length(training)])
  return(featNames)
  
  featCor <- cor(training[,featNames],training[,featNames])
  index <- is.na(featCor)
  featCor2 <- replace(featCor, index, 0)
  corrgram(featCor2, type='cor')
}