diff --git a/source/skeleton.R b/source/skeleton.R index 73f8f229da5f0e84c27abb92f4e8c87c22410b12..1fcc6e58ffe0c58628c0aac1b8cbe61dbb1d07dc 100644 --- a/source/skeleton.R +++ b/source/skeleton.R @@ -374,7 +374,7 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be # Extract sizes of every partitions sizes <- unlist(lapply(parts, length)) - cat("Partition have sizes of: ", sizes, "\n") + #cat("Partition have sizes of: ", sizes, "\n") # Extract every partition's GE,PK,SK parts @@ -401,26 +401,26 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be SK_in[[i-1]] <- subSK remove(subSK) } - cat("PK sum: ", unlist(lapply(PK_in, sum)), "\n") + #cat("PK sum: ", unlist(lapply(PK_in, sum)), "\n") # prepare BGe score parameters # this is a large computation that computes a large parameter matrix: # N x N, where N is the sum of sizes, e.g., total number of genes - cat("Computing large BGe TN matrix...\n") + #cat("Computing large BGe TN matrix...\n") myScore <- scoreparameters_BiDAG_BGe(n = ncol(GE), data = GE) param_in <- list(n=myScore$n, awpN=myScore$awpN, TN=myScore$TN, scoreconstvec=myScore$scoreconstvec) remove(myScore) #saveRDS(param_in, "param.RData") #param_in <- readRDS("param.RData") - cat("TN matrix have a size of: ", dim(param_in$TN), "\n") + #cat("TN matrix have a size of: ", dim(param_in$TN), "\n") ref_score <- score_G(PK_ref, PK_in, beta_L, beta_H, param_in) - cat("Score of prior matrix: \n") + #cat("Score of prior matrix: \n") print(ref_score$pG + ref_score$pDG) #remove(ref_score) #remove(PK_ref) - cat("Running the SK-SS Rcpp part:\n") + #cat("Running the SK-SS Rcpp part:\n") G_out <- c_SK_SS(PK_in,SK_in, sizes, param_in, I, debug, beta_L, beta_H, score_G, score_change_G) @@ -433,5 +433,7 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be PK_out <- exp(PK_out) PK_out <- PK_out >= alpha + saveRDS(PK_out, "SKSS_output.RDS") + return(list(PK_out=PK_out, PK_score=ref_score$pG + ref_score$pDG)) } diff --git a/source/skeleton.cpp b/source/skeleton.cpp index a42a04dd2b321bb1e28c505be02aabcba180be7d..30da229b41e686494b007f699df6528f1b6b33de 100644 --- a/source/skeleton.cpp +++ b/source/skeleton.cpp @@ -399,7 +399,7 @@ bool CompressedTrie::appendRecursive(key_tp* word, uint32_t len) { bool ret = cand->is_graph; cand->is_graph = true; - return ret; + return !ret; } // go deeper @@ -692,6 +692,12 @@ public: const List* SK_list = this->input_data.SK; List* ret = new List(SK_list->length()); + if(this->logSumScores == RNegInf) + { + this->logSumScores = 0; + } + + for(uint32_t i = 0; i < SK_list->length(); i++) { /* @@ -1102,6 +1108,7 @@ protected: void SaveGraph() { + return; // Save intermediate results //Rprintf("Saving intermediate results [%9.6f]\n", this->logSumScores); @@ -1265,15 +1272,43 @@ List c_SK_SS( //#include "skeleton.h" // [[Rcpp::export]] -int test(const List& SK, const List& PK, double val) +int test() { - + CompressedTrie V_G; + + std::vector<key_tp> word; + word.push_back( + std::make_tuple((uint8_t)0, (uint16_t)0, (uint16_t)2) + ); + + bool ret = V_G.AppendChain(word); + Rcout << "ret: " << ret << "\n"; + + word.push_back( + std::make_tuple((uint8_t)0, (uint16_t)0, (uint16_t)3) + ); + + ret = V_G.AppendChain(word); + Rcout << "ret: " << ret << "\n"; + + + + + bool ret2 = V_G.AppendChain(word); + Rcout << "ret2: " << ret2 << "\n"; + word.erase(word.begin() + 0); + ret2 = V_G.AppendChain(word); + Rcout << "ret2: " << ret2 << "\n"; + ret2 = V_G.AppendChain(word); + Rcout << "ret2: " << ret2 << "\n"; + return -1; + /* uint32_t N = PK.length(); uint32_t countOnes = 0; @@ -1295,7 +1330,7 @@ int test(const List& SK, const List& PK, double val) } } } - return countOnes; + return countOnes;*/ } diff --git a/source/synthetic.R b/source/synthetic.R index 34145ff797e3daf5128791015bcd8ae692a2c565..2cebb442cb522bef8663da6e9cb393c16960dc69 100644 --- a/source/synthetic.R +++ b/source/synthetic.R @@ -26,6 +26,7 @@ softmax <- function(v) return(ret) } + compare_methods <- function(input_type="sigmoid", only_allowed=FALSE, add_random_reference=FALSE) { library(caret) @@ -35,17 +36,18 @@ compare_methods <- function(input_type="sigmoid", only_allowed=FALSE, add_random library(RColorBrewer) library(ggnewscale) library(ggrepel) + library(bnlearn) if(input_type == "sigmoid") { - folder <- "~/Desktop/pim-sk-ss/results/MAP/" + folder <- "~/Desktop/pim-sk-ss/results/COMPARE/algos/" } else{ folder <- "~/Desktop/pim-sk-ss/results/SK/SK_algos/" } data_fname <- "data.RDS" - data <- readRDS("~/Desktop/pim-sk-ss/results/MAP/data.RDS") + data <- readRDS("~/Desktop/pim-sk-ss/results/COMPARE/data.RDS") parts <- data$partition to_test <- list.files(folder) @@ -57,21 +59,52 @@ compare_methods <- function(input_type="sigmoid", only_allowed=FALSE, add_random myColors <- rep(myColors, 10) mets <- c("cor", "mc-cor", "smc-cor", "zf", "mc-zf", "smc-zf", "mi-g", "mc-mi-g", "smc-mi-g", "mi-g-sh") - + Rr <- data$Real_adjacency if(only_allowed) { - real <- c(data$Real_adjacency[parts[[1]], parts[[2]]], - data$Real_adjacency[parts[[2]], parts[[3]]]) + storage.mode(Rr) <- "numeric" + real <- c(Rr[parts[[1]], parts[[2]]], + Rr[parts[[2]], parts[[3]]]) } else{ - real <- unlist(c(data$Real_adjacency)) + real <- unlist(c(Rr)) } - real <- factor( real, levels = c(0, 1)) - df_res <- data.frame(matrix(0, nrow=0, ncol=7)) + + + + + + PK <- data$PK_adjacency + storage.mode(PK) <- "numeric" + if(only_allowed) + { + pred <- c(unlist(PK[parts[[1]], parts[[2]]]), unlist(PK[parts[[2]], parts[[3]]])) + } + else + { + pred <- unlist(c(PK)) + } + pred <- factor( + pred, + levels = c(0, 1)) + confMat <- caret::confusionMatrix(pred,real) + TPR <- confMat$table[1,2] + FPR <- confMat$table[2,1] + acc <- confMat$table[1,2] + confMat$table[2,1] + + algo <- "Prior" + met <- "Prior matrix" + df_res <- rbind(df_res, c(TPR, FPR,acc,algo,met, "-", nrow(data$SigmoidData))) + + + + + + for(fname in to_test) { if(fname == data_fname) @@ -80,8 +113,19 @@ compare_methods <- function(input_type="sigmoid", only_allowed=FALSE, add_random } adj <- readRDS(paste(folder, fname, sep="")) - #SK <- amat(adj) + if(all(class(adj) == "bn")) + { + adj <- bnlearn::amat(adj) + } + else{ + asd <- 1 + } + SK <- adj + SK <- as.matrix(SK) + storage.mode(SK) <- "numeric" + + if(nrow(SK) != length(unlist(data$partition))) { N <- length(unlist(data$partition)) @@ -141,10 +185,6 @@ compare_methods <- function(input_type="sigmoid", only_allowed=FALSE, add_random acc <- (confMat$table[1,1] + confMat$table[2,2]) / sum(c(confMat$table)) acc <- confMat$table[1,2] + confMat$table[2,1] - if(acc > 50) - { - next - } algo <- "random" @@ -322,9 +362,11 @@ find_closest <- function(GE) # miRNA iteration expr <- G * X # take expr values without any parents expr_mi <- expr[,c(partitions[[2]])] - expr_mi <- apply(expr_mi, 2, function(cl) c(0, cl[cl != 0])) + expr_mi <- apply(expr_mi, 2, function(cl) c(1, cl[cl != 0]), simplify=F) param_mi <- params[c(partitions[[2]])] + #param_mi <- do.call(cbind, param_mi) X_mi <- Map('*', expr_mi, param_mi) + #X_mi <- expr_mi * param_mi X_mi <- lapply(X_mi, sum) X[c(partitions[[2]])] <- X[c(partitions[[2]])] + unlist(X_mi) # mRNA iteration @@ -380,11 +422,11 @@ generate_data <- function(N_samples=100, p_rand = 0.05, prior_FP = 0.20, prior_FN = 0.20, - epsilon_std = 0.1, + epsilon_std = 0.01, beta_zero_mu=-11.2, beta_zero_std=0.05, beta_mu=0.0, # these parameters simulate the range of values: (0, 19) with mean=13 of TPM MDS data - beta_std=0.2, # 0.2 + beta_std=0.4, # 0.2 type="gaussian" ) { @@ -400,13 +442,13 @@ generate_data <- function(N_samples=100, # CircRNA - miRNA mat_circ_mi <- matrix(0, n_circ, n_mirna) N_circ_mi <- n_circ * n_mirna - circ_mi_indices <- sample(N_circ_mi, N_circ_mi * p_rand) + circ_mi_indices <- sample(N_circ_mi, N_circ_mi * p_rand + 1) mat_circ_mi[circ_mi_indices] <- 1 Real_adjacency[1:n_circ, (n_circ + 1):(n_circ + n_mirna)] <- mat_circ_mi # miRNA - mRNA mat_mi_m <- matrix(0, n_mirna, n_mrna) N_mi_m <- n_mirna * n_mrna - mi_m_indices <- sample(N_mi_m, N_mi_m * p_rand) + mi_m_indices <- sample(N_mi_m, N_mi_m * p_rand + 1) mat_mi_m[mi_m_indices] <- 1 Real_adjacency[(n_circ + 1):(n_circ + n_mirna), (n_circ + n_mirna + 1):(N)] <- mat_mi_m # Set fake names diff --git a/source/testing.R b/source/testing.R new file mode 100644 index 0000000000000000000000000000000000000000..2d21725a2dfca64bdf1fce264408f1afbb3833c9 --- /dev/null +++ b/source/testing.R @@ -0,0 +1,198 @@ + + + +library(gtools) +library(plyr) +library(matrixStats) + +source("source/BiDAG_BGe.R") + +.generate_all_subsets <- function(Nrows, Ncols) +{ + + lin_vals <- gtools::permutations(2, Nrows*Ncols, c(T,F), repeats.allowed = TRUE) + + ret <- plyr::alply(lin_vals, 1, function(rw){ + ret <- matrix(rw, Nrows, Ncols) + return(ret) + }) + return(ret) +} +.generate_all_subsets_per_partition <- function(parts) +{ + N <- 0 + for(i in 2:length(parts)) + { + N <- N + length(parts[[i-1]])*length(parts[[i]]) + } + + combo_matrices <- gtools::permutations(2, N, c(T,F), repeats.allowed = TRUE) + combo_matrices <- plyr::alply(combo_matrices, 1, function(rw){ + return(rw) + }) + + + retmat <- NA + offset <- 0 + for(i in 2:length(parts)) + { + Nr <- length(parts[[i-1]]) + Nc <- length(parts[[i]]) + + mats <- lapply(combo_matrices, function(m) { + linmat <- m[offset + 1:(Nc*Nr)] + ret <- matrix(linmat, nrow=Nr, ncol=Nc) + rownames(ret) <- parts[[i-1]] + colnames(ret) <- parts[[i]] + return(ret) + }) + + if(i == 2) + { + retmat <- mats + } + else if(i == 3) + { + retmat <- mapply(list, retmat, mats, SIMPLIFY = F) + } + else + { + retmat <- mapply(append, retmat, mats, SIMPLIFY = F) + } + + offset <- offset + Nc * Nr + } + return(retmat) +} + +init_score_G_BN <- function(data, beta_L=0.1, beta_H=10.5) +{ + GE <- data$SigmoidData + PK <- data$PK_adjacency + parts <- data$partition + real_adj <- data$Real_adjacency + + # Remove zero rows + zeroGenes <- names(which(colSums(GE) == 0)) + if(length(zeroGenes) > 0) + { + for(i in 1:length(parts)) + { + prev <- parts[[i]] + parts[[i]] <- prev[!(prev %in% zeroGenes)] + } + } + + # Extract sizes of every partitions + sizes <- unlist(lapply(parts, length)) + + # Extract every partition's GE,PK,SK parts + PK_in <- list() + Real_in <- list() + for(i in 2:length(parts)) + { + # PK + subPK <- PK[parts[[i-1]],parts[[i]]] + subPK <- as.matrix(subPK) + # convert PK to prior scores as in IntOMICS/Bayesian Network Prior(Isci) + # subPK <- subPK * 0.5 + 0.5 # ---> 1 = prior, 0.5 = no prior, but allowed edge + # all non-allowed edges are removed explicitly by list + + PK_in[[i-1]] <- subPK + remove(subPK) + + subReal <- real_adj[parts[[i-1]],parts[[i]]] + Real_in[[i-1]] <- subReal + remove(subReal) + } + + param_bge <- scoreparameters_BiDAG_BGe(n=ncol(data$SigmoidData), data = data$SigmoidData) + + param <- list(bge=param_bge, prior=PK_in, parts=parts, beta_L=beta_L, beta_H=beta_H, real_adj=Real_in) + + return(param) +} + + + + + +comp_score_G_BN <- function(G, param) +{ + param_bge <- param$bge + PK <- param$prior + K <- sum(unlist(lapply(PK, sum))) + pDG <- 0 + energy <- 0 + for(i in 1:length(G)) # for each partition interaction separately + { + adj <- G[[i]] + bpk <- PK[[i]] + pDG <- pDG + BGe_compute(adj, param_bge) + energy <- energy + sum(bpk * (1 - adj)) + } + beta <- optimum_beta(energy, K, param$beta_L, param$beta_H) + pG <- fast_point_log_P_G(beta, K, energy) + return(pG + pDG) +} + +test_all_structures <- function( + data, init_score_G=init_score_G_BN, comp_score_G=comp_score_G_BN, num_iter=64 + ) +{ + # Compute the parameters of a given score + param <- init_score_G(data) + + # Compute the score of a true adjacency matrix + score_real <- comp_score_G(param$real_adj, param) + + + combos <- .generate_all_subsets_per_partition(data$partition) + scores <- lapply(combos, function(G) + comp_score_G(G, param) + ) + + cat("Best score: ", max(scores), ", real network: ", score_real, "\n") + + N <- length(combos) + ids <- sample(N, num_iter * ncol(data$SigmoidData)) + + combos <- combos[ids] + scores <- scores[ids] + + + map_est <- which.max(scores) + map_est <- combos[[map_est]] + + + posteriors <- mapply(function(g, i) { + lret <- mapply("*", g, i) + lret <- mapply(function(m) { + m[m == 0] <- -Inf + m + }, lret) + lret + }, + combos, scores, SIMPLIFY = F) + + posterior <- sapply(2:length(data$partition), function(id) + { + gs <- lapply(posteriors, function(le) le[[id-1]]) + + sum <- Reduce(function(g1, g2){ + # logsumexp for two matrices + lret <- rowLogSumExps(cbind(c(g1), c(g2))) + lret <- matrix(lret, nrow=nrow(g1), ncol=ncol(g1)) + lret + } , gs) + return(sum) + }) + sum_scores <- Reduce(function(i,j) logSumExp(c(i,j)), scores) + + + posterior <- mapply("-", posterior, sum_scores) + posterior <- mapply(exp, posterior) + posterior <- mapply(function(m) m > 0.5, posterior) + + return(list(map=map_est, posterior=posterior)) +} \ No newline at end of file diff --git a/workflow.Rmd b/workflow.Rmd index a94ef300f0382d40af0b9f6e86a2b7516b8350ec..513e68146076434d2c43629fd427ae2aa517b702 100644 --- a/workflow.Rmd +++ b/workflow.Rmd @@ -630,8 +630,8 @@ remove(bigGE) ```{r} source("source/synthetic.R") -data <- generate_data(N_samples = 100, prior_FP=0.2, prior_FN=0.5, - n_circ = 200, n_mirna = 200, n_mrna = 1000) +data <- generate_data(N_samples = 100, prior_FP=0.5, prior_FN=0.5, p_rand = 0.1, + n_circ = 750, n_mirna = 250, n_mrna = 2500, beta_std = 0.02, beta_mu=0.02) data$SigmoidData <- 10^6 / (1 + exp(-data$Data)) #saveRDS(data, "results/SK/data.RDS") d1 <- unlist(c(data$SigmoidData)) @@ -657,6 +657,7 @@ legend("topright", c("Generated", "Original"), fill=c(rgb(0,0,1,1/4), rgb(0,1,0, PK <- data$PK_adjacency parts <- data$partition GE <- data$SigmoidData +#saveRDS(data, "data.RDS") ``` ```{r echo=FALSE, fig.keep='all', message=FALSE, results='hide', error=FALSE, warning=FALSE} @@ -677,19 +678,21 @@ algorithms <- c( si_hiton_pc=bnlearn::si.hiton.pc ) -algorithms <-c( - hill_climb=bnlearn::hc, - tabu_search=bnlearn::tabu -) +#algorithms <-c( +# hill_climb=bnlearn::hc, +# tabu_search=bnlearn::tabu +#) to_test_mets <- c("cor", "mc-cor", "smc-cor", "zf", "mc-zf", "smc-zf", "mi-g", "mc-mi-g", "smc-mi-g", "mi-g-sh") -to_test_mets <- c("loglik-g", "aic-g", "bic-g", "ebic-g", "bge", "pred-loglik-g") +to_test_mets <- c("cor") +#to_test_mets <- c("loglik-g", "aic-g", "bic-g", "ebic-g", "bge", "pred-loglik-g") real <- factor( c(data$Real_adjacency[parts[[1]], parts[[2]]], data$Real_adjacency[parts[[2]], parts[[3]]]), levels = c(0, 1)) +saveRDS(data, "results/COMPARE_3500/data.RDS") for(i in 1:length(algorithms)) { @@ -708,14 +711,9 @@ for(i in 1:length(algorithms)) } name_title <- paste(alg_name, "-", met, sep="") - saveRDS(adj, paste("results/SK/SK_algos_sigmoid/", name_title, ".RDS", sep="")) - SK <- amat(adj) - - pred <- factor( - c(SK[parts[[1]], parts[[2]]], SK[parts[[2]], parts[[3]]]), - levels = c(0, 1)) - - draw_confusion_matrix(caret::confusionMatrix(pred,real), title_name = name_title) + adj <- amat(adj) + adj <- data.frame(adj) + saveRDS(adj, paste("results/COMPARE_3500/algos/", name_title, ".RDS", sep="")) } } @@ -864,6 +862,82 @@ cat("accuracy: ", acc * 100, "%\n") draw_confusion_matrix(caret::confusionMatrix(pred,real)) ``` +```{r, warning=FALSE} +source("source/testing.R") + +library(bnlearn) + +results <- data.frame(matrix(NA, 0, 4)) +colnames(results) <- c("Skeleton", "MAP", "Posterior", "SK-SS") +for(i in 1:10) +{ + data <- generate_data(N_samples = 5, prior_FP=0.0, prior_FN=0.5, p_rand = 0.5, + n_circ = 3, n_mirna = 2, n_mrna = 6) + data$SigmoidData <- 10^6 / (1 + exp(-data$Data)) + parts <- data$partition + PK <- data$PK_adjacency + + real <- factor( + c(data$Real_adjacency[parts[[1]], parts[[2]]], data$Real_adjacency[parts[[2]], parts[[3]]]), + levels = c(0, 1)) + + # Skeleton local method + SK <- iamb.fdr(x=data$SigmoidData, undirected = TRUE) + SK <- bnlearn::amat(SK) + SK <- data.frame(SK) + pred <- factor( + unlist(c(SK[parts[[1]], parts[[2]]], SK[parts[[2]], parts[[3]]])), + levels = c(0, 1)) + confusion_mat <- caret::confusionMatrix(pred,real) + shd_sk <- confusion_mat$table[1,2] + confusion_mat$table[2,1] + + ret <- test_all_structures(data) + ret_post <- ret$posterior + ret_post <- lapply(ret_post, function(m) { + storage.mode(m) <- "numeric" + m + }) + ret_map <- ret$map + ret_map <- lapply(ret_map, function(m) { + storage.mode(m) <- "numeric" + m + }) + + # Posterior + pred_post <- factor( + unlist(ret_post), + levels = c(0, 1)) + confusion_mat <- caret::confusionMatrix(pred_post,real) + shd_post <- confusion_mat$table[1,2] + confusion_mat$table[2,1] + # MAP + pred_map <- factor( + unlist(ret_map), + levels = c(0, 1)) + confusion_mat <- caret::confusionMatrix(pred_map,real) + shd_map <- confusion_mat$table[1,2] + confusion_mat$table[2,1] + + # SK-SS + ret <- SK_SS_tripartite(data$SigmoidData,data$PK_adjacency,SK,parts=data$partition, I=20) + PK_out <- ret$PK_out + storage.mode(PK_out) <- "numeric" + pred_skss <- factor( + unlist(c(PK_out[parts[[1]], parts[[2]]], PK_out[parts[[2]], parts[[3]]])), + levels = c(0, 1)) + confusion_mat <- caret::confusionMatrix(pred_skss,real) + shd_skss <- confusion_mat$table[1,2] + confusion_mat$table[2,1] + if(shd_skss == 0) + { + ad <- 1 + } + + + cat("Iter:", i, ", SK:", shd_sk, ", MAP:", shd_map, ", posterior: ", shd_post, "skss:", shd_skss, "\n") + results <- rbind(results, c(shd_sk, shd_map, shd_post, shd_skss)) +} +results <<- results + +``` + ## IntOMICS > **Some explanation:**