diff --git a/source/BiDAG_BGe.R b/source/BiDAG_BGe.R index 8343ca12fce5bdb8d25cf4eb4744c3ce0109c9ee..a2928410eca471c4e5ee683b4f6bfd5e0a15655b 100644 --- a/source/BiDAG_BGe.R +++ b/source/BiDAG_BGe.R @@ -2,9 +2,25 @@ #' BGe score computation, taken from IntOMICS implementation: #' <DAGcorescore_BiDAG_BGe.R> #' -BGe_node_compute <- function(j,parentnodes, param) +BGe_compute <- function(adj, param) { - n <- param$n + nodes_cand <- colnames(adj) + sc <- sum(sapply(nodes_cand, function(node) + { + neights <- names(which(adj[,node] > 0)) + n <- param$n + score <- BGe_node_compute(node, neights, param, n) + return(score) + }) + ) + #cat("Score of part:", sc) + return(sc) +} + + +BGe_node_compute <- function(j,parentnodes, param, n) +{ + #n <- param$n TN <- param$TN awpN <- param$awpN scoreconstvec <- param$scoreconstvec @@ -54,10 +70,9 @@ BGe_node_compute <- function(j,parentnodes, param) #################################################################################################### ## This function is from BiDAG package (https://cran.r-project.org/web/packages/BiDAG/index.html) ## -## arameters needed for calculation of the BGe score ## +## parameters needed for calculation of the BGe score ## ## see arXiv:1402.6863 ## #################################################################################################### - scoreparameters_BiDAG_BGe <- function (n, data, bgepar = list(am = 1, aw = NULL)) diff --git a/source/circGPA/annotate.R b/source/circGPA/annotate.R index 3f58eff6838e4c03d17fe2984942cceeffbe4043..49b6c6c6d944af1030de4cac21112f4aa7497960 100644 --- a/source/circGPA/annotate.R +++ b/source/circGPA/annotate.R @@ -1,95 +1,95 @@ -# This script should calculate annotation of a circRNA with GO terms - -library(polynom) -library(geometry) # only for the dot function -library(tictoc) -source('buildGraph.R') -source('pValue.R') - -setClass("annotationResult", - slots=list( - score="numeric", - pvalue="numeric", - scorenorm="numeric", - expectedScore="numeric", - time="numeric", - pvalueTime="numeric", - bootstrap="numeric", - bootstrapTime="numeric", - goSize="numeric" - ) -) -print.annotationResult <- function(x) { cat(paste("Normalized score:", x@scorenorm, "\n")); cat(paste("p-value:", x@pvalue, "\n")); } - -Im <- buildIm() - -annotateCirc <- function(circRNA, goTermList, goLower = -1, goUpper = Inf) { - muCirc <- buildMuCirc(circRNA) - return(sapply(goTermList, function(goTerm) { - tryCatch({ - print(paste("Probing GO term:", goTerm)) - goMu <- buildGOMu(goTerm) - goM <- buildGOM(goTerm) - goSize <- sum(goMu) + sum(goM) - if(goSize > goUpper | goSize < goLower) { return(NULL) } - sol <- annotateVectorized(muCirc, goMu, goM, Im) - print(sol) - return(sol) - }, error=function(cond) { - print(cond) - return(NULL) - }) - })) -} - -annotate <- function(circRNA, goTerm) { - return(annotateVectorized(buildMuCirc(circRNA), buildGOMu(goTerm), buildGOM(goTerm), Im)) -} - -# dataVector = vector with counts how many times each muRNA/mRNA is connected to the circRNA -buildPolynomial <- function(dataVector) { - tbl <- as.data.frame(table(factor(dataVector, levels = 0:max(dataVector)))) - return(polynomial(coef = tbl[,2])) -} - -# uniform probability vector for goMu or goM - vec(1)*sum(arr) / len(arr) -uniformize <- function(goVec) { - return(rep(1, length(goVec)) * sum(goVec) / length(goVec)) -} - -pValueSimple <- function(muCirc, goMu, goM, ImmuCirc, score) { - # get the p-value, convert the muCirc vector to the polynomial - muCircPoly <- buildPolynomial(muCirc) - ImMuCircPoly <- buildPolynomial(ImmuCirc) - # now calculate power of those to get the number of ways - generatingPoly <- muCircPoly^sum(goMu) * ImMuCircPoly^sum(goM) - generatingCoefs <- coefficients(generatingPoly) - # the p-value is how many of the combinations gave higher score than we have - return(sum(generatingCoefs[(score+1):length(generatingCoefs)]) / sum(generatingCoefs)) # score + 1 as coefficients are x^0 + x^1 indexed from 1 -} - -# goMu - 0/1 vector of is mu in goTerm -# goM the same is mRNA in goTerm -# muCirc - 0/1/more vecotor of circrna neighborhood -# Im - matrix with interactions between muRNA nad miRNA -# test: -# gomu <- c(0,1,1); gom <- c(1,1,1,0,0); mucirc <- c(1,1,1); im <- matrix(c(0,1,1,1,1,1,0,0,1,1,0,0,1,1,0), nrow=5, ncol=3, byrow=TRUE); annotateVectorized(mucirc, gomu, gom, im) -annotateVectorized <- function(muCirc, goMu, goM, Im) { - time <- as.numeric(system.time({ - ImmuCirc <- Im %*% muCirc - score <- dot(muCirc, goMu) + dot(ImmuCirc, goM) # ignore the halfs, I love whole numbers - pvalueTime <- as.numeric(system.time({ - pvalue <- pValueFull(muCirc, goMu, goM, ImmuCirc, score) - })['elapsed']) - bootstrapTime <- as.numeric(system.time({ - bootstrap <- NaN #pValueBootstrap(muCirc, goMu, goM, ImmuCirc, score) # NaN # - })['elapsed']) - expectedScore <- dot(muCirc, uniformize(goMu)) + dot(ImmuCirc, uniformize(goM)) - scorenorm <- score / expectedScore - goSize <- sum(goM) + sum(goMu) - })['elapsed']) - # return - return(new("annotationResult", - score=score, pvalue=pvalue, scorenorm=scorenorm, expectedScore=expectedScore, time=time, pvalueTime=pvalueTime, bootstrap=bootstrap, bootstrapTime=bootstrapTime, goSize=goSize - )) -} +# This script should calculate annotation of a circRNA with GO terms + +library(polynom) +library(geometry) # only for the dot function +library(tictoc) +source('buildGraph.R') +source('pValue.R') + +setClass("annotationResult", + slots=list( + score="numeric", + pvalue="numeric", + scorenorm="numeric", + expectedScore="numeric", + time="numeric", + pvalueTime="numeric", + bootstrap="numeric", + bootstrapTime="numeric", + goSize="numeric" + ) +) +print.annotationResult <- function(x) { cat(paste("Normalized score:", x@scorenorm, "\n")); cat(paste("p-value:", x@pvalue, "\n")); } + +Im <- buildIm() + +annotateCirc <- function(circRNA, goTermList, goLower = -1, goUpper = Inf) { + muCirc <- buildMuCirc(circRNA) + return(sapply(goTermList, function(goTerm) { + tryCatch({ + print(paste("Probing GO term:", goTerm)) + goMu <- buildGOMu(goTerm) + goM <- buildGOM(goTerm) + goSize <- sum(goMu) + sum(goM) + if(goSize > goUpper | goSize < goLower) { return(NULL) } + sol <- annotateVectorized(muCirc, goMu, goM, Im) + print(sol) + return(sol) + }, error=function(cond) { + print(cond) + return(NULL) + }) + })) +} + +annotate <- function(circRNA, goTerm) { + return(annotateVectorized(buildMuCirc(circRNA), buildGOMu(goTerm), buildGOM(goTerm), Im)) +} + +# dataVector = vector with counts how many times each muRNA/mRNA is connected to the circRNA +buildPolynomial <- function(dataVector) { + tbl <- as.data.frame(table(factor(dataVector, levels = 0:max(dataVector)))) + return(polynomial(coef = tbl[,2])) +} + +# uniform probability vector for goMu or goM - vec(1)*sum(arr) / len(arr) +uniformize <- function(goVec) { + return(rep(1, length(goVec)) * sum(goVec) / length(goVec)) +} + +pValueSimple <- function(muCirc, goMu, goM, ImmuCirc, score) { + # get the p-value, convert the muCirc vector to the polynomial + muCircPoly <- buildPolynomial(muCirc) + ImMuCircPoly <- buildPolynomial(ImmuCirc) + # now calculate power of those to get the number of ways + generatingPoly <- muCircPoly^sum(goMu) * ImMuCircPoly^sum(goM) + generatingCoefs <- coefficients(generatingPoly) + # the p-value is how many of the combinations gave higher score than we have + return(sum(generatingCoefs[(score+1):length(generatingCoefs)]) / sum(generatingCoefs)) # score + 1 as coefficients are x^0 + x^1 indexed from 1 +} + +# goMu - 0/1 vector of is mu in goTerm +# goM the same is mRNA in goTerm +# muCirc - 0/1/more vecotor of circrna neighborhood +# Im - matrix with interactions between muRNA nad miRNA +# test: +# gomu <- c(0,1,1); gom <- c(1,1,1,0,0); mucirc <- c(1,1,1); im <- matrix(c(0,1,1,1,1,1,0,0,1,1,0,0,1,1,0), nrow=5, ncol=3, byrow=TRUE); annotateVectorized(mucirc, gomu, gom, im) +annotateVectorized <- function(muCirc, goMu, goM, Im) { + time <- as.numeric(system.time({ + ImmuCirc <- Im %*% muCirc + score <- dot(muCirc, goMu) + dot(ImmuCirc, goM) # ignore the halfs, I love whole numbers + pvalueTime <- as.numeric(system.time({ + pvalue <- pValueFull(muCirc, goMu, goM, ImmuCirc, score) + })['elapsed']) + bootstrapTime <- as.numeric(system.time({ + bootstrap <- NaN #pValueBootstrap(muCirc, goMu, goM, ImmuCirc, score) # NaN # + })['elapsed']) + expectedScore <- dot(muCirc, uniformize(goMu)) + dot(ImmuCirc, uniformize(goM)) + scorenorm <- score / expectedScore + goSize <- sum(goM) + sum(goMu) + })['elapsed']) + # return + return(new("annotationResult", + score=score, pvalue=pvalue, scorenorm=scorenorm, expectedScore=expectedScore, time=time, pvalueTime=pvalueTime, bootstrap=bootstrap, bootstrapTime=bootstrapTime, goSize=goSize + )) +} diff --git a/source/circGPA/buildGraph.R b/source/circGPA/buildGraph.R index 9670d1f11c7f5f2802b0fe13eec63cdfcdc70385..15b1677fb64434098186075a71d5db54e344918a 100644 --- a/source/circGPA/buildGraph.R +++ b/source/circGPA/buildGraph.R @@ -1,104 +1,104 @@ - # BiocManager::install("miRBaseConverter") -# BiocManager::install("multiMiR") - -library(miRBaseConverter) - -#library(msigdbr) -library(GO.db) -library(org.Hs.eg.db) -library("biomaRt") -library(httr) -httr::set_config(config(ssl_verifypeer = FALSE)) - -library(multiMiR) - -#library(Rfast) # for binary search - no way, only double - -#source("spidermirdownload.R") -source("circinteractome.R") -source("mirnaGO.R") - -#all_gene_sets = msigdbr(species = "Homo sapiens") -#allegs = get("GO:0070104", org.Hs.egGO2ALLEGS) -#genes = unlist(mget(allegs,org.Hs.egSYMBOL)) - -# see https://support.bioconductor.org/p/124462/ -listGenes <- function() { - tryCatch({ - mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl") - all_coding_genes <- getBM(attributes = c( "hgnc_symbol"), filters = c("biotype"), values = list(biotype="protein_coding"), mart = mart) - return(sort(all_coding_genes$hgnc_symbol)) - }, error=function(cond) { - print(cond) - return(readRDS("all_coding_genes.RDS")) - }) -} - -listMiRNA <- function() { - miRNATab=getMiRNATable(version="v22",species="hsa") - return(sort(unique(miRNATab$Mature1))) -} - -multiMirGeneInteract <- function() { - map <- read.csv("mirna-mrna-interactions.csv") - map <- subset(map, select = -c(X) ) - return(map) -} - -if (file.exists("graph.RData")) { - load("graph.RData") -} else { - all_gene_list <- listGenes() - all_mirna_list <- listMiRNA() - # save(all_gene_list, all_mirna_list, file = "graph.RData") -} - -geneInteract <- multiMirGeneInteract() -circ_mirna_map <- loadMap() -mirna_go_map <- loadMapMiRNAGoTerms() - -buildIm <- function() { - Im <- matrix(0, nrow=length(all_gene_list), ncol=length(all_mirna_list)) - # for(i in 1:nrow(geneInteract)) { - # miRNA <- geneInteract[i,1] - # gene <- geneInteract[i,2] - # - # miRNAI <- which(all_mirna_list == miRNA) - # geneI <- which(all_gene_list == gene) - # - # if(length(miRNAI) == 0 | length(geneI) == 0) { next } - # - # Im[geneI[1], miRNAI[1]] <- 1 - # } - - geneInteract <- geneInteract[geneInteract[,1] %in% all_mirna_list,] - geneInteract <- geneInteract[geneInteract[,2] %in% all_gene_list,] - geneInteract[,1] <- match(geneInteract[,1], all_mirna_list) - geneInteract[,2] <- match(geneInteract[,2], all_gene_list) - - Im[cbind(geneInteract[,2], geneInteract[,1])] <- 1 - - - return(Im) -} - -buildMuCirc <- function(circRNA) { - interactions <- circ_mirna_map[circ_mirna_map$circRNA == circRNA,] - return(as.numeric(all_mirna_list %in% interactions$miRNAs)) -} - -buildGOMu <- function(GOterm) { - goid <- names(Term(GOTERM)[which(longnames == GOterm)]) - mirnas <- mirna_go_map$Mature1[mirna_go_map$goTerm == GOterm | mirna_go_map$goTerm == goid] - return(as.numeric(all_mirna_list %in% mirnas)) -} - -buildGOM <- function(GOterm) { - if(GOterm %in% names(m_list)) { - genes <- unlist(m_list[names(m_list) == GOterm]) - } else { - allegs = get(GOterm, org.Hs.egGO2ALLEGS) - genes = unlist(mget(allegs,org.Hs.egSYMBOL)) - } - return(as.numeric(all_gene_list %in% genes)) -} + # BiocManager::install("miRBaseConverter") +# BiocManager::install("multiMiR") + +library(miRBaseConverter) + +#library(msigdbr) +library(GO.db) +library(org.Hs.eg.db) +library("biomaRt") +library(httr) +httr::set_config(config(ssl_verifypeer = FALSE)) + +library(multiMiR) + +#library(Rfast) # for binary search - no way, only double + +#source("spidermirdownload.R") +source("circinteractome.R") +source("mirnaGO.R") + +#all_gene_sets = msigdbr(species = "Homo sapiens") +#allegs = get("GO:0070104", org.Hs.egGO2ALLEGS) +#genes = unlist(mget(allegs,org.Hs.egSYMBOL)) + +# see https://support.bioconductor.org/p/124462/ +listGenes <- function() { + tryCatch({ + mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl") + all_coding_genes <- getBM(attributes = c( "hgnc_symbol"), filters = c("biotype"), values = list(biotype="protein_coding"), mart = mart) + return(sort(all_coding_genes$hgnc_symbol)) + }, error=function(cond) { + print(cond) + return(readRDS("all_coding_genes.RDS")) + }) +} + +listMiRNA <- function() { + miRNATab=getMiRNATable(version="v22",species="hsa") + return(sort(unique(miRNATab$Mature1))) +} + +multiMirGeneInteract <- function() { + map <- read.csv("mirna-mrna-interactions.csv") + map <- subset(map, select = -c(X) ) + return(map) +} + +if (file.exists("graph.RData")) { + load("graph.RData") +} else { + all_gene_list <- listGenes() + all_mirna_list <- listMiRNA() + # save(all_gene_list, all_mirna_list, file = "graph.RData") +} + +geneInteract <- multiMirGeneInteract() +circ_mirna_map <- loadMap() +mirna_go_map <- loadMapMiRNAGoTerms() + +buildIm <- function() { + Im <- matrix(0, nrow=length(all_gene_list), ncol=length(all_mirna_list)) + # for(i in 1:nrow(geneInteract)) { + # miRNA <- geneInteract[i,1] + # gene <- geneInteract[i,2] + # + # miRNAI <- which(all_mirna_list == miRNA) + # geneI <- which(all_gene_list == gene) + # + # if(length(miRNAI) == 0 | length(geneI) == 0) { next } + # + # Im[geneI[1], miRNAI[1]] <- 1 + # } + + geneInteract <- geneInteract[geneInteract[,1] %in% all_mirna_list,] + geneInteract <- geneInteract[geneInteract[,2] %in% all_gene_list,] + geneInteract[,1] <- match(geneInteract[,1], all_mirna_list) + geneInteract[,2] <- match(geneInteract[,2], all_gene_list) + + Im[cbind(geneInteract[,2], geneInteract[,1])] <- 1 + + + return(Im) +} + +buildMuCirc <- function(circRNA) { + interactions <- circ_mirna_map[circ_mirna_map$circRNA == circRNA,] + return(as.numeric(all_mirna_list %in% interactions$miRNAs)) +} + +buildGOMu <- function(GOterm) { + goid <- names(Term(GOTERM)[which(longnames == GOterm)]) + mirnas <- mirna_go_map$Mature1[mirna_go_map$goTerm == GOterm | mirna_go_map$goTerm == goid] + return(as.numeric(all_mirna_list %in% mirnas)) +} + +buildGOM <- function(GOterm) { + if(GOterm %in% names(m_list)) { + genes <- unlist(m_list[names(m_list) == GOterm]) + } else { + allegs = get(GOterm, org.Hs.egGO2ALLEGS) + genes = unlist(mget(allegs,org.Hs.egSYMBOL)) + } + return(as.numeric(all_gene_list %in% genes)) +} diff --git a/source/circGPA/circinteractome.R b/source/circGPA/circinteractome.R index 837d2f7512cb0932d7c8ac9076ba216a15ad0437..51c4c30b8a0233bc76118dc94556606029343e53 100644 --- a/source/circGPA/circinteractome.R +++ b/source/circGPA/circinteractome.R @@ -1,61 +1,61 @@ -# this code is used to download interacting miRNAs from the CircInteractome database -# unfortunatelly, CircInteractome is the only database working now (CircNet is down) -# and it has only "User-Input web interface" - -# for http request (was preinstalled on RCI) -library("httr") -# for regular expressions -library("stringr") - -#r <- GET("https://circinteractome.nia.nih.gov/api/v2/mirnasearch?circular_rna_query=hsa_circ_0007848&mirna_query=&submit=miRNA+Target+Search") -#content(r, "text") -# hsa-miR-xxxx -# regex hsa-(mir|miR|miRNA|mirna|microRNA|microrna|let|lin)-[[:alnum:]\\-]{1,10} - -circInteractome <- function(circRNA) { - print(paste("Downloading circRNA interactions for", circRNA)) - url <- paste("https://circinteractome.nia.nih.gov/api/v2/mirnasearch?circular_rna_query=", circRNA, "&mirna_query=&submit=miRNA+Target+Search", sep="") - r <- GET(url) - if (r$status_code != 200) { - stop("problem connecting to the circinteractome database") - } - mirnas <- str_extract_all(content(r, "text"), "hsa-(mir|miR|miRNA|microRNA|let|lin)-[[:alnum:]\\-]{1,10}")[[1]] - unique(mirnas) -} - -circInteractomeList <- function(circRNAs) { - result <- data.frame() - - for(circRNA in circRNAs) { - miRNAs <- circInteractome(circRNA) - newRows <- cbind(circRNA, miRNAs) - result <- rbind(result, newRows) - } - - result -} - -loadMap <- function() { - tryCatch({ - map <- read.csv('circrna-mirna-interactions.csv') - subset(map, select = -c(X) ) - }, error = function(e) { - data.frame(circRNA=character(0),miRNAs=character(0)) - }) -} - -circInteractomeListCached <- function(circRNAs, wait=TRUE) { - for(circRNA in circRNAs) { - map <- loadMap() - if(circRNA %in% map$circRNA) { next } - print(paste("querying", circRNA)) - miRNAs <- circInteractome(circRNA) - newRows <- cbind(circRNA, miRNAs) - map <- rbind(map, newRows) - write.csv(as.data.frame(map), file='circrna-mirna-interactions.csv') - if(wait) { Sys.sleep(50) } - } - - map <- loadMap() - return(map[map[,1] %in% circRNAs,]) -} +# this code is used to download interacting miRNAs from the CircInteractome database +# unfortunatelly, CircInteractome is the only database working now (CircNet is down) +# and it has only "User-Input web interface" + +# for http request (was preinstalled on RCI) +library("httr") +# for regular expressions +library("stringr") + +#r <- GET("https://circinteractome.nia.nih.gov/api/v2/mirnasearch?circular_rna_query=hsa_circ_0007848&mirna_query=&submit=miRNA+Target+Search") +#content(r, "text") +# hsa-miR-xxxx +# regex hsa-(mir|miR|miRNA|mirna|microRNA|microrna|let|lin)-[[:alnum:]\\-]{1,10} + +circInteractome <- function(circRNA) { + print(paste("Downloading circRNA interactions for", circRNA)) + url <- paste("https://circinteractome.nia.nih.gov/api/v2/mirnasearch?circular_rna_query=", circRNA, "&mirna_query=&submit=miRNA+Target+Search", sep="") + r <- GET(url) + if (r$status_code != 200) { + stop("problem connecting to the circinteractome database") + } + mirnas <- str_extract_all(content(r, "text"), "hsa-(mir|miR|miRNA|microRNA|let|lin)-[[:alnum:]\\-]{1,10}")[[1]] + unique(mirnas) +} + +circInteractomeList <- function(circRNAs) { + result <- data.frame() + + for(circRNA in circRNAs) { + miRNAs <- circInteractome(circRNA) + newRows <- cbind(circRNA, miRNAs) + result <- rbind(result, newRows) + } + + result +} + +loadMap <- function() { + tryCatch({ + map <- read.csv('circrna-mirna-interactions.csv') + subset(map, select = -c(X) ) + }, error = function(e) { + data.frame(circRNA=character(0),miRNAs=character(0)) + }) +} + +circInteractomeListCached <- function(circRNAs, wait=TRUE) { + for(circRNA in circRNAs) { + map <- loadMap() + if(circRNA %in% map$circRNA) { next } + print(paste("querying", circRNA)) + miRNAs <- circInteractome(circRNA) + newRows <- cbind(circRNA, miRNAs) + map <- rbind(map, newRows) + write.csv(as.data.frame(map), file='circrna-mirna-interactions.csv') + if(wait) { Sys.sleep(50) } + } + + map <- loadMap() + return(map[map[,1] %in% circRNAs,]) +} diff --git a/source/circGPA/circrna-mirna-interactions.csv b/source/circGPA/circrna-mirna-interactions.csv index 30495ae07339f19dbaba482bbd3153ab8c426182..2cb6f25235f401ccfde91aba7a2488619d993836 100644 Binary files a/source/circGPA/circrna-mirna-interactions.csv and b/source/circGPA/circrna-mirna-interactions.csv differ diff --git a/source/circGPA/mirna-mrna-interactions.csv b/source/circGPA/mirna-mrna-interactions.csv index a2db8eeb22d71e2457b2e8ace418f15e0730da94..1c3d1f38f6d1b2914797421a24138f43d65153e3 100644 Binary files a/source/circGPA/mirna-mrna-interactions.csv and b/source/circGPA/mirna-mrna-interactions.csv differ diff --git a/source/circGPA/run.R b/source/circGPA/run.R index e46259f5e1031f23c62ce0932961bd6ff62e76eb..de90cbbad1126e384448ba648bd5c423c3104e67 100644 --- a/source/circGPA/run.R +++ b/source/circGPA/run.R @@ -1,101 +1,101 @@ -#args = commandArgs(trailingOnly=TRUE) -#circname <- args[1] - -prev_wd <- getwd() -setwd("circGPA/") -library(stringr) -source('annotate.R') -source('goterms.R') -library("openxlsx") -setwd(prev_wd) -remove(prev_wd) - -resultsToDataFrame <- function(results) { - names <- slotNames('annotationResult') - df <- sapply(names, function(name) lapply(results, function(r) { if(is.null(r)) NA else slot(r, name) })) - # order the data frame - #df <- df[is.null(df[,"pvalue"]),] - df <- df[order(unlist(df[,"pvalue"])),] - merge(df, as.data.frame(goTerms), by="row.names") -} - -#wanted_go_terms <- sample(goTermNames, 100) -wanted_go_terms <- goTermNames - -circGPA1corr <- read.xlsx('../data/hsa_circ_0000227-correlated-filtered-short.xlsx', sheet=1, rowNames=TRUE, colNames = TRUE) -circGPA2corr <- read.xlsx('../data/hsa_circ_0000228-correlated-filtered-short.xlsx', sheet=1, rowNames=TRUE, colNames = TRUE) -circGPA3corr <- read.xlsx('../data/hsa_circ_0003793-correlated-filtered-short.xlsx', sheet=1, rowNames=TRUE, colNames = TRUE) - -go_1 <- rownames(circGPA1corr) -go_2 <- rownames(circGPA2corr) -go_3 <- rownames(circGPA3corr) - -wanted_go_terms <- intersect(intersect(go_1, go_2), go_3) - -remove(go_1) -remove(go_2) -remove(go_3) - -circGPA1corr <- circGPA1corr[rownames(circGPA1corr) %in% wanted_go_terms,] -circGPA2corr <- circGPA1corr[rownames(circGPA2corr) %in% wanted_go_terms,] -circGPA3corr <- circGPA1corr[rownames(circGPA3corr) %in% wanted_go_terms,] - -runOnCirc <- function(circRNA, goLower = 10, goUpper = 1000) { - results <- annotateCirc(circRNA, wanted_go_terms) - df <- resultsToDataFrame(results) - df <- df[order(unlist(df[,"pvalue"])),] - df <- cbind(df, bonferroni=p.adjust(df$pvalue, method="bonferroni"), fdr=p.adjust(df$pvalue, method="fdr")) - return(df) - tst <<- df - - write.xlsx(df, paste(circRNA, ".xlsx", sep="")) - - df <- df[!is.na(df$pvalue) & df$goSize >= goLower & df$goSize <= goUpper,] - df$bonferroni <- p.adjust(df$pvalue, method="bonferroni") - df$fdr <- p.adjust(df$pvalue, method="fdr") - if(nrow(df) >= 1) { - write.xlsx(df, paste(circRNA, "-short.xlsx", sep="")) - } -} - -buildIm <- function() { - Im <- matrix(0, nrow=length(all_gene_list), ncol=length(all_mirna_list)) - for(i in 1:nrow(geneInteract)) { - miRNA <- geneInteract[i,1] - gene <- geneInteract[i,2] - - miRNAI <- which(all_mirna_list == miRNA) - geneI <- which(all_gene_list == gene) - - if(length(miRNAI) == 0 | length(geneI) == 0) { next } - - Im[geneI[1], miRNAI[1]] <- 1 - } - return(Im) -} - -buildMuCirc <- function(circRNA) { - interactions <- circ_mirna_map[circ_mirna_map$circRNA == circRNA,] - return(as.numeric(all_mirna_list %in% interactions$miRNAs)) -} - -circname1 <- "hsa_circ_0000227" -circname2 <- "hsa_circ_0000228" -circname3 <- "hsa_circ_0003793" - - -df1 <- runOnCirc(circname1) -saveRDS(df1, paste(circname1, "_circGPA.RData", sep="")) -df2 <- runOnCirc(circname2) -saveRDS(df2, paste(circname2, "_circGPA.RData", sep="")) -df3 <- runOnCirc(circname3) -saveRDS(df3, paste(circname3, "_circGPA.RData", sep="")) - - -res1 <- cor.test(x=df1$fdr, y=circGPA1corr$fdr, method = 'spearman') -res2 <- cor.test(x=df2$fdr, y=circGPA2corr$fdr, method = 'spearman') -res3 <- cor.test(x=df3$fdr, y=circGPA3corr$fdr, method = 'spearman') -#runOnCirc("hsa_circ_0003793") - -#runOnCirc(circname) - +#args = commandArgs(trailingOnly=TRUE) +#circname <- args[1] + +prev_wd <- getwd() +setwd("circGPA/") +library(stringr) +source('annotate.R') +source('goterms.R') +library("openxlsx") +setwd(prev_wd) +remove(prev_wd) + +resultsToDataFrame <- function(results) { + names <- slotNames('annotationResult') + df <- sapply(names, function(name) lapply(results, function(r) { if(is.null(r)) NA else slot(r, name) })) + # order the data frame + #df <- df[is.null(df[,"pvalue"]),] + df <- df[order(unlist(df[,"pvalue"])),] + merge(df, as.data.frame(goTerms), by="row.names") +} + +#wanted_go_terms <- sample(goTermNames, 100) +wanted_go_terms <- goTermNames + +circGPA1corr <- read.xlsx('../data/hsa_circ_0000227-correlated-filtered-short.xlsx', sheet=1, rowNames=TRUE, colNames = TRUE) +circGPA2corr <- read.xlsx('../data/hsa_circ_0000228-correlated-filtered-short.xlsx', sheet=1, rowNames=TRUE, colNames = TRUE) +circGPA3corr <- read.xlsx('../data/hsa_circ_0003793-correlated-filtered-short.xlsx', sheet=1, rowNames=TRUE, colNames = TRUE) + +go_1 <- rownames(circGPA1corr) +go_2 <- rownames(circGPA2corr) +go_3 <- rownames(circGPA3corr) + +wanted_go_terms <- intersect(intersect(go_1, go_2), go_3) + +remove(go_1) +remove(go_2) +remove(go_3) + +circGPA1corr <- circGPA1corr[rownames(circGPA1corr) %in% wanted_go_terms,] +circGPA2corr <- circGPA1corr[rownames(circGPA2corr) %in% wanted_go_terms,] +circGPA3corr <- circGPA1corr[rownames(circGPA3corr) %in% wanted_go_terms,] + +runOnCirc <- function(circRNA, goLower = 10, goUpper = 1000) { + results <- annotateCirc(circRNA, wanted_go_terms) + df <- resultsToDataFrame(results) + df <- df[order(unlist(df[,"pvalue"])),] + df <- cbind(df, bonferroni=p.adjust(df$pvalue, method="bonferroni"), fdr=p.adjust(df$pvalue, method="fdr")) + return(df) + tst <<- df + + write.xlsx(df, paste(circRNA, ".xlsx", sep="")) + + df <- df[!is.na(df$pvalue) & df$goSize >= goLower & df$goSize <= goUpper,] + df$bonferroni <- p.adjust(df$pvalue, method="bonferroni") + df$fdr <- p.adjust(df$pvalue, method="fdr") + if(nrow(df) >= 1) { + write.xlsx(df, paste(circRNA, "-short.xlsx", sep="")) + } +} + +buildIm <- function() { + Im <- matrix(0, nrow=length(all_gene_list), ncol=length(all_mirna_list)) + for(i in 1:nrow(geneInteract)) { + miRNA <- geneInteract[i,1] + gene <- geneInteract[i,2] + + miRNAI <- which(all_mirna_list == miRNA) + geneI <- which(all_gene_list == gene) + + if(length(miRNAI) == 0 | length(geneI) == 0) { next } + + Im[geneI[1], miRNAI[1]] <- 1 + } + return(Im) +} + +buildMuCirc <- function(circRNA) { + interactions <- circ_mirna_map[circ_mirna_map$circRNA == circRNA,] + return(as.numeric(all_mirna_list %in% interactions$miRNAs)) +} + +circname1 <- "hsa_circ_0000227" +circname2 <- "hsa_circ_0000228" +circname3 <- "hsa_circ_0003793" + + +df1 <- runOnCirc(circname1) +saveRDS(df1, paste(circname1, "_circGPA.RData", sep="")) +df2 <- runOnCirc(circname2) +saveRDS(df2, paste(circname2, "_circGPA.RData", sep="")) +df3 <- runOnCirc(circname3) +saveRDS(df3, paste(circname3, "_circGPA.RData", sep="")) + + +res1 <- cor.test(x=df1$fdr, y=circGPA1corr$fdr, method = 'spearman') +res2 <- cor.test(x=df2$fdr, y=circGPA2corr$fdr, method = 'spearman') +res3 <- cor.test(x=df3$fdr, y=circGPA3corr$fdr, method = 'spearman') +#runOnCirc("hsa_circ_0003793") + +#runOnCirc(circname) + diff --git a/source/circGPA_analysis.R b/source/circGPA_analysis.R index 898995a36d72f8df8450edcbf764702140a5822a..9b585deadf53e228760d36c635545813f95d2437 100644 --- a/source/circGPA_analysis.R +++ b/source/circGPA_analysis.R @@ -6,6 +6,45 @@ source('annotate.R') source('goterms.R') library("openxlsx") + +circGPA_original_map_mirna_mrna <- "CIRCGPA_mirna-mrna-interactions.csv" +circGPA_original_map_circ_mirna <- "CIRCGPA_circrna-mirna-interactions.csv" + +reset_to_circGPA_matrix <- function() { + old.loc <- paste("data/", circGPA_original_map_mirna_mrna, sep="") + new.loc <- paste("source/circGPA/", circGPA_original_map_mirna_mrna, sep="") + + file.copy( + old.loc, + new.loc + ) + file.rename( + new.loc, + paste( + "source/circGPA/", + sub('^CIRCGPA_', '', circGPA_original_map_mirna_mrna), + sep="") + ) + + + old.loc <- paste("data/", circGPA_original_map_circ_mirna, sep="") + new.loc <- paste("source/circGPA/", circGPA_original_map_circ_mirna, sep="") + + file.copy( + old.loc, + new.loc + ) + file.rename( + new.loc, + paste( + "source/circGPA/", + sub('^CIRCGPA_', '', circGPA_original_map_circ_mirna), + sep="") + ) + + source("source/circGPA_analysis.R") +} + resultsToDataFrame <- function(results) { names <- slotNames('annotationResult') df <- sapply(names, function(name) lapply(results, function(r) { if(is.null(r)) NA else slot(r, name) })) diff --git a/source/data_circGPA_preprocess.R b/source/data_circGPA_preprocess.R index 8285464a0d23ce9aa85ea1f92866d1db0ff07415..adc1ae4eb2134ead2af6832dc2b4f36c2a416d5b 100644 --- a/source/data_circGPA_preprocess.R +++ b/source/data_circGPA_preprocess.R @@ -129,11 +129,6 @@ matureMIRNA_remove_suffix <- function(mirnas) # TODO remove { divide_combined_form <- function(combo) { - #combo_orig <- combo - if(combo == "MIR103A1") - { - tst <- 1 - } # first --- remove some suffix in form of "-5p" combo <- strsplit(combo, split="-")[[1]][1] diff --git a/source/empirical_prior.R b/source/empirical_prior.R new file mode 100644 index 0000000000000000000000000000000000000000..c79c8ce31c2f17f593366b59cb51d777d4144881 --- /dev/null +++ b/source/empirical_prior.R @@ -0,0 +1,147 @@ +library(matrixStats) +library(VeryLargeIntegers) + +#' +#' For the derivation of approximation see: +#' https://math.stackexchange.com/questions/64716/approximating-the-logarithm-of-the-binomial-coefficient +#' +f_log_combi <- function(n, k) +{ + if((k == 0) || (k == n)) return(0) + + # For tractable (inside the INT32 range) inputs we compute the precise number + if(n < 30) + { + ret <- asnumeric(binom(n, k)) + ret <- log(ret) + return(ret) + } + + # Alternatively: + # log(x, base=exp(1)) returns wrong numbers + # loge ---> NO, for large numbers (f.e. 45489, 200) it returns NA!!! + #ret <- binom(n, k) + #ret <- log(ret, base=exp(1)) + #ret <- asnumeric(ret) + #exact_num <- ret + + # Otherwise we use the Stirling's approximation (very close, O(1/n, 1/k)): + ret <- n * log(n) - k * log(k) - (n-k) * log(n - k) + ret <- ret + 0.5 * (log(n) - log(k) - log(n - k) - log(2 * pi)) + + return(ret) +} + +possible_log_z_b <- function(beta, K) +{ + ret <- K * log(1 + exp(-beta)) + return(ret) + + + + # inps <- 0:K + # inps <- sapply(inps, function(i) { + # ret1 <- f_log_combi(K, i) + # #ret2 <- (N - K) * 0.5 + i + # ret2 <- i * beta + # ret <- ret1 - ret2 + # return(ret) + # }) + # inps <- logSumExp(inps) + # + # cat(ret, " x ", inps, "\n") + # return(inps) +} + +fast_point_log_P_G <- function(beta, K, E) +{ + log_z_b <- possible_log_z_b(beta, K) + ret <- -beta * E - log_z_b + return(ret) +} + +myclamp <- function(val, lower, upper) +{ + if(val < lower) return(lower) + if(val > upper) return(upper) + return(val) +} + + +optimum_beta <- function(E, K, beta_L, beta_H) +{ + # If E == 0, can't divide by 0, take + if(E == 0) return(beta_H) + + ret <- K / E - 1 + ret <- log(ret) + ret <- myclamp(ret, lower=beta_L, upper=beta_H) + return(ret) +} + +marginalization_P_G <- function(E, K, beta_L, beta_H, delta_beta=0.1) +{ + betas <- seq(beta_L, beta_H, delta_beta) + + ret <- sapply(betas, function(b) fast_point_log_P_G(b, K, E)) + + ret <- logSumExp(ret) + + ret <- ret - log(beta_H - beta_L) + log(delta_beta) + + return(ret) +} + + +#' +#' Compute a prior energy function of the given adjacency graph. Formula: +#' E(i,j) = +#' (if edge in G) = 1 - B(i,j) +#' +#' (if edge not in G) = B(i,j) +#' +#' E(G) = sum(i,j) E(i,j) +#' +#' For reference see: +#' * IntOMICS: +#' https://doi.org/10.1186/s12859-022-04891-9 +#' * Bayesian prior see: +#' https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3957076/ +#' +#' @param G_adjs A list of adjacency matrices across each partition interaction, for example: list(CIRC->MI, MI->M) +#' @param B_PKs A list of prior matrices for each partition interaction, for example: list(CIRC->MI, MI->M), ! modified to scores [1,0.5,0] +#' @param N A total number of genes +#' @returns E(G), the prior energy function for a given graph +#' +Energy_compute <- function(G_adjs, B_PKs, N) +{ + E <- 0 + + # we should count blocked edges too, + # they have 0 prior probability -> E(i,j) = 1 + # we do it by doing (N x N) - (non-blocked number) + E_non_blocked <- 0 + + for(i in 1:length(G_adjs)) # for each partition interaction separately + { + # IntOMICS scoring + # if 1 ---> 1-B(i,j), ...., if 0 ----> B(i,j) + # E <- sum(G_adjs[[i]] * (1-B_PKs[[i]]) + (1-G_adjs[[i]]) * B_PKs[[i]]) + + + # BP(Isci) scoring + # if 1 ---> 1-B(i,j), ...., if 0 ----> 1 + E <- sum(1 - B_PKs[[i]] * G_adjs[[i]]) + E + + # Add non-blocked + E_non_blocked <- E_non_blocked + ncol(B_PKs[[i]]) * nrow(B_PKs[[i]]) + + } + + # in both scoring we add +1 for each blocked edge too + # to compute it we just compute non-blocked and then: + # [TOTAL] - [non-BLOCKED] = [BLOCKED] = N^2 - [non-BLOCKED] + E <- (E + N*N - E_non_blocked) + + return(E) +} diff --git a/source/general.R b/source/general.R index fda93cd00827b5f595a6d378b309d29b7ede0fda..6151f2130ad9137e225c5132c800139b6d7325d8 100644 --- a/source/general.R +++ b/source/general.R @@ -1,25 +1,25 @@ - - -src.source <- function(nm) -{ - prev_wd <- getwd() - setwd(getSrcDirectory(function(){})[1]) - #setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) - source(nm) - setwd(prev_wd) -} - -src.cppsource <- function(nm) -{ - prev_wd <- getwd() - setwd(getSrcDirectory(function(){})[1]) - #setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) - Rcpp::sourceCpp(nm) - setwd(prev_wd) -} - -msg <- function(x, width = 120) -{ - message("\r", rep(" ", times = width - length(x)), "\r", appendLF = F) - message(x, appendLF = F) + + +src.source <- function(nm) +{ + prev_wd <- getwd() + setwd(getSrcDirectory(function(){})[1]) + #setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) + source(nm) + setwd(prev_wd) +} + +src.cppsource <- function(nm) +{ + prev_wd <- getwd() + setwd(getSrcDirectory(function(){})[1]) + #setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) + Rcpp::sourceCpp(nm) + setwd(prev_wd) +} + +msg <- function(x, width = 120) +{ + message("\r", rep(" ", times = width - length(x)), "\r", appendLF = F) + message(x, appendLF = F) } \ No newline at end of file diff --git a/source/mmpc_optimized.R b/source/mmpc_optimized.R index 7c24934e90ba1b694cd4eeb790db91a8000f77cc..b5ba0ede9112ab07958343dcf8613fc4f3531b2e 100644 --- a/source/mmpc_optimized.R +++ b/source/mmpc_optimized.R @@ -1,3 +1,6 @@ +library(bnlearn) +suppressWarnings(attach(getNamespace("bnlearn"))) + mmpc_bnlearn_tripartite <- function(x, cluster, partitions, whitelist = NULL, test = NULL, alpha = 0.05, B = NULL, max.sx = NULL, debug = FALSE, undirected = TRUE) @@ -124,6 +127,7 @@ mmpc_bnlearn_tripartite <- function(x, cluster, partitions, whitelist = NULL, for (node in nodes) mb[[node]]$mb = fake.markov.blanket(mb, node) + mb = bn.recovery(mb, nodes = nodes, debug = debug) return(mb) } @@ -138,7 +142,7 @@ mmpc_bnlearn_tripartite <- function(x, cluster, partitions, whitelist = NULL, my_mid <- mask[idx] x <- all_nodes[idx] - cat("Node: ", x, ", myid:", my_mid, "\n") + #cat("Node: ", x, ", myid:", my_mid, "\n") if(my_mid == length(partitions)) { @@ -182,17 +186,17 @@ mmpc_bnlearn_tripartite <- function(x, cluster, partitions, whitelist = NULL, to.be.checked = setdiff(names(which(association < alpha)), cpc) - cat("Node: ", idx, ", CPC: [", length(cpc), " + ", length(to.be.checked),"]\n") + #cat("Node: ", idx, ", CPC: [", length(cpc), " + ", length(to.be.checked),"]\n") - start_time <- Sys.time() + #start_time <- Sys.time() association = sapply(to.be.checked, maxmin.pc.heuristic.optimized, y = x, sx = cpc, data = data, test = test, alpha = alpha, B = B, association = association, complete = complete, debug = FALSE) - end_time <- Sys.time() - cat("Node: ", idx, ", CPC: [", length(cpc), " + ", length(to.be.checked),"], t: ", end_time - start_time,"\n") + #end_time <- Sys.time() + #cat("Node: ", idx, ", CPC: [", length(cpc), " + ", length(to.be.checked),"], t: ", end_time - start_time,"\n") if (all(association > alpha) || length(nodes) == 0 || @@ -219,7 +223,7 @@ mmpc_bnlearn_tripartite <- function(x, cluster, partitions, whitelist = NULL, { my_mid <- mask[idx] x <- all_nodes[idx] - cat("Node: ", x, ", myid:", my_mid) + #cat("Node: ", x, ", myid:", my_mid) if(my_mid == length(partitions)) { diff --git a/source/skeleton.R b/source/skeleton.R index 2930839088a85d132c1d1993db2e8bbd8e2ff182..73f8f229da5f0e84c27abb92f4e8c87c22410b12 100644 --- a/source/skeleton.R +++ b/source/skeleton.R @@ -1,22 +1,14 @@ prev_wd <- getwd() -setwd(getSrcDirectory(function(){})[1]) +#setwd(getSrcDirectory(function(){})[1]) + +this.dir <- dirname(parent.frame(2)$ofile) +setwd(this.dir) source("BiDAG_BGe.R") +source("empirical_prior.R") Rcpp::sourceCpp("skeleton.cpp") +library(matrixStats) -BGe_compute <- function(adj, param) -{ - nodes_cand <- colnames(adj) - sc <- sum(sapply(nodes_cand, function(node) - { - neights <- names(which(adj[,node] > 0)) - score <- BGe_node_compute(node, neights, param) - return(score) - }) - ) - cat("Score of part:", sc) - return(sc) -} setwd(prev_wd) remove(prev_wd) @@ -27,13 +19,14 @@ BGe_change <- function(part_id, from, to, action_is_add, score_prev, G_adjs, par node <- colnames(adj)[to] neights <- names(which(adj[,node] > 0)) - score_node_prev <- BGe_node_compute(node, neights, param) + n <- param$n + score_node_prev <- BGe_node_compute(node, neights, param, n) adj[from, to] <- action_is_add node <- colnames(adj)[to] neights <- names(which(adj[,node] > 0)) - score_node <- BGe_node_compute(node, neights, param) + score_node <- BGe_node_compute(node, neights, param, n) adj[from, to] <- (1 - action_is_add) @@ -43,91 +36,49 @@ BGe_change <- function(part_id, from, to, action_is_add, score_prev, G_adjs, par return(ret) } - -#' -#' Compute a prior energy function of the given adjacency graph. Formula: -#' E(i,j) = -#' (if edge in G) = 1 - B(i,j) -#' -#' (if edge not in G) = B(i,j) -#' -#' E(G) = sum(i,j) E(i,j) -#' -#' For reference see: -#' * IntOMICS: -#' https://doi.org/10.1186/s12859-022-04891-9 -#' * Bayesian prior see: -#' https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3957076/ -#' -#' @param G_adjs A list of adjacency matrices across each partition interaction, for example: list(CIRC->MI, MI->M) -#' @param B_PKs A list of prior matrices for each partition interaction, for example: list(CIRC->MI, MI->M), ! modified to scores [1,0.5,0] -#' @param N A total number of genes -#' @returns E(G), the prior energy function for a given graph -#' -Energy_compute <- function(G_adjs, B_PKs, N) -{ - E <- 0 - - # we should count blocked edges too, - # they have 0 prior probability -> E(i,j) = 1 - # we do it by doing (N x N) - (non-blocked number) - E_non_blocked <- 0 - - for(i in 1:length(G_adjs)) # for each partition interaction separately - { - # IntOMICS scoring - # if 1 ---> 1-B(i,j), ...., if 0 ----> B(i,j) - # E <- sum(G_adjs[[i]] * (1-B_PKs[[i]]) + (1-G_adjs[[i]]) * B_PKs[[i]]) - - - # BP(Isci) scoring - # if 1 ---> 1-B(i,j), ...., if 0 ----> 1 - E <- sum(1 - B_PKs[[i]] * G_adjs[[i]]) - - # Add non-blocked - E_non_blocked <- E_non_blocked + ncol(B_PKs[[i]]) * nrow(B_PKs[[i]]) - - } - - # in both scoring we add +1 for each blocked edge too - # to compute it we just compute non-blocked and then: - # [TOTAL] - [non-BLOCKED] = [BLOCKED] = N^2 - [non-BLOCKED] - E <- (E + N*N - E_non_blocked) - - return(E) -} - #' #' Compute a score~proportional to~ log P(G) for a given graph. This part of function should be called #' #' For an integral formula of the Bayesian prior see: #' https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3957076/ #' -#' The formula used below is the closed-form solution -#' for definite integral with the given Beta_L, Beta_H. +#' The formula used below is the closed-form MAP solution with given Beta_L, Beta_H #' -#' @param E A previously computed energy U(G) for a graph -#' @param N A total number of genes -#' @param beta_L A lower threshold on prior's weight (numeric) -#' @param beta_H An upper threshold on prior's weight (numeric) -#' @returns score ~proportional to~ P(G), network prior for a given network -#' -pP_G_compute <- function(E, N, beta_L, beta_H) -{ - U <- E / (N*N) - delta_beta <- beta_H - beta_L - - ret <- exp(-U * beta_L) - exp(-U * beta_H) - ret <- ret / delta_beta - ret <- ret / U - ret <- log(ret) - return(ret) -} - -P_G_change <- function(part_id, from, to, action_is_add, N, energy_prev, B_PKs, beta_L, beta_H) +P_G_change <- function(part_id, from, to, action_is_add, prev_score, energy, B_PKs, beta_L, beta_H) { B_PK_mat <- B_PKs[[part_id]] b_i_j <- B_PK_mat[from, to] + #K <- sum(unlist(lapply(B_PKs, sum))) + K <- sum(unlist(lapply(B_PKs, length))) + + ret <- list(score=prev_score, diff=0, energy=energy) + #if(b_i_j != 1) return(ret) + + u_change <- 1 - 2 * b_i_j + u_change <- (1 - 2 * action_is_add) * u_change + + # + #cat("total energy:", energy, ", uchange:", u_change, "\n") + + total_energy <- energy + u_change + + beta <- optimum_beta(total_energy, K, beta_L, beta_H) + + + sP_G <- fast_point_log_P_G(beta, K, total_energy) + #sP_G <- marginalization_P_G(total_energy, K, beta_L, beta_H) + + ret <- list(score=sP_G, diff=sP_G - prev_score, energy=total_energy) + return(ret) + + + + if(u_change == 0) + { + return(ret) + } + + # IntOMICS change: @@ -147,19 +98,48 @@ P_G_change <- function(part_id, from, to, action_is_add, N, energy_prev, B_PKs, u_change <- (1 - 2 * action_is_add) * b_i_j U.new <- energy_prev + u_change - score <- pP_G_compute(U.new, N, beta_L, beta_H) + score <- pP_G_compute(U.new, N_sq, beta_L, beta_H) + + + diff <- score - score_prev + energies[part_id] <- U.new + scores[part_id] <- score + + ret <- list(score=sum(scores), diff=diff, energy=energy) - ret <- list(score=score, energy=U.new) return(ret) } +P_G_neighbours_change <- function(G_adjs, part_id, from, to, action_is_add) +{ + prev_number <- sum(G_adjs[[part_id]][,to]) # + 1 + new_number <- prev_number + (2 * action_is_add - 1) + + + possible_parents <- nrow(G_adjs[[part_id]]) + + #prev_number <- -log(prev_number) + #new_number <- -log(new_number) + + prev_number <- -f_log_combi(n=possible_parents, k=prev_number) + new_number <- -f_log_combi(n=possible_parents, k=new_number) + + change <- new_number - prev_number + return(change) +} #' -#' Compute a score-P(G) for a given graph, e.g., non-normalized P(G). -#' +#' Compute a score-P(G) for a given graph. Formula (by Isci): +#' E(i,j) = +#' (if edge in G) = 1 - B(i,j) +#' +#' (if edge not in G) = B(i,j) +#' +#' E(G) = sum(i,j) E(i,j) +#' #' For theoretical reference see: #' * IntOMICS: #' https://doi.org/10.1186/s12859-022-04891-9 @@ -174,14 +154,31 @@ P_G_change <- function(part_id, from, to, action_is_add, N, energy_prev, B_PKs, #' P_G <- function(G_adjs, B_PKs, beta_L, beta_H) { - # compute total number of genes - N <- ncol(G_adjs[[1]]) + sum(unlist(lapply(G_adjs, nrow))) - - # The computation is divided for further optimization: - # energy could be changed by a constant - E <- Energy_compute(G_adjs, B_PKs, N) - sP_G <- pP_G_compute(E, N, beta_L, beta_H) - return(list(score=sP_G, energy=E)) + total_energy <- 0 + K <- sum(unlist(lapply(B_PKs, sum))) + #K <- sum(unlist(lapply(B_PKs, length))) + for(i in 1:length(G_adjs)) # for each partition interaction separately + { + # IntOMICS score: + # if edge included ----> (1 - b_i_j) + # if edge no included -> (b_i_j) + adj <- G_adjs[[i]] + bpk <- B_PKs[[i]] + bpk <- bpk - 0.5 + bpk <- bpk * 2 + #local_energy <- bpk * (1 - adj) + (1 - bpk) * adj + local_energy <- bpk * (1 - adj) + local_energy <- sum(local_energy) + total_energy <- total_energy + local_energy + } + cat("total_energy: ", total_energy, "\n") + beta <- optimum_beta(total_energy, K, beta_L, beta_H) + + sP_G <- fast_point_log_P_G(beta, K, total_energy) + + #sP_G <- marginalization_P_G(total_energy, K, beta_L, beta_H) + + return(list(score=sP_G, energy=total_energy)) } #' @@ -214,6 +211,24 @@ P_D_G <- function(G_adjs, param_in) return(pDG1) } +P_G_neighbours <- function(G_adjs) +{ + score_ret <- sum(unlist(lapply(G_adjs, function(adj){ + + neights <- colSums(adj) + possible_parents <- nrow(adj) + + #neights <- neights + 1 # very small penalty + + neights <- sapply(neights, function(nt) -f_log_combi(n=possible_parents, k=nt)) + sum(neights) + + #-log(neights) + }))) + return(score_ret) +} + + #' #' Compute a score list: [log P(D|G), log P(G)] for a given graph, e.g., #' log P(G | D) = log P(D|G) + log P(G) @@ -233,8 +248,14 @@ score_G <- function(G_adjs, B_PKs, beta_L, beta_H, param_in) { pG_list <- P_G(G_adjs, B_PKs, beta_L, beta_H) pDG <- P_D_G(G_adjs, param_in) - #pGD <- pG * pDG - return(c("pG"=pG_list$score, "pDG"=pDG, "energy"=pG_list$energy, "pG_change"=0, "pDG_change"=0)) + pG_neight <- P_G_neighbours(G_adjs) + #pG_neight <- 0 + + cat(pG_list$score + pG_neight, ", ", pDG, "\n") + + return(list("pG"=pG_list$score + pG_neight, + "pDG"=pDG, "energy"=pG_list$energy, + "pG_change"=0, "pDG_change"=0)) } #' @@ -258,24 +279,26 @@ score_G <- function(G_adjs, B_PKs, beta_L, beta_H, param_in) #' @returns P(D | G) = BGe score of a given graph after change #' score_change_G <- function(G_adjs, B_PKs, beta_L, beta_H, param_in, - sizes, score_prev, part_id, from, to, action_is_add) { ret <- score_prev - N <- sum(sizes) - + # ! Convert C++ indices to Rcpp indices(+1) ! - AC <- P_G_change(part_id+1, from+1, to+1, action_is_add, N, ret["energy"], B_PKs, beta_L, beta_H) - B <- BGe_change(part_id+1, from+1, to+1, action_is_add, ret["pDG"], G_adjs, param_in) + AC <- P_G_change(part_id+1, from+1, to+1, action_is_add, score_prev$pG, ret$energy, B_PKs, beta_L, beta_H) + B <- BGe_change(part_id+1, from+1, to+1, action_is_add, ret$pDG, G_adjs, param_in) + + D <- P_G_neighbours_change(G_adjs, part_id+1, from+1, to+1, action_is_add) + #D <- 0 - prev <- ret - - ret["pG"] <- AC$score - ret["pDG"] <- B$score - ret["energy"] <- AC$energy - ret["pG_change"] <- AC$score - prev["pG"] - ret["pDG_change"] <- B$diff + ret <- list() + ret$pG <- AC$score + D + ret$pDG <- B$score + ret$energy <- AC$energy + ret$pG_change <- AC$diff + D + ret$pDG_change <- B$diff + #print(ret) + return(ret) } @@ -357,13 +380,15 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be # Extract every partition's GE,PK,SK parts PK_in <- list() SK_in <- list() - + PK_ref <- list() for(i in 2:length(parts)) { # PK subPK <- PK[parts[[i-1]],parts[[i]]] subPK <- as.matrix(subPK) + PK_ref[[i-1]] <- subPK + # convert PK to prior scores as in IntOMICS/Bayesian Network Prior(Isci) subPK <- subPK * 0.5 + 0.5 # ---> 1 = prior, 0.5 = no prior, but allowed edge # all non-allowed edges are removed explicitly by list @@ -376,35 +401,37 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be SK_in[[i-1]] <- subSK remove(subSK) } - + cat("PK sum: ", unlist(lapply(PK_in, sum)), "\n") # prepare BGe score parameters # this is a large computation that computes a large parameter matrix: # N x N, where N is the sum of sizes, e.g., total number of genes cat("Computing large BGe TN matrix...\n") - #myScore <- scoreparameters_BiDAG_BGe(n = ncol(GE), data = GE) - #param_in <- list(n=myScore$n, awpN=myScore$awpN, TN=myScore$TN, scoreconstvec=myScore$scoreconstvec) - #GLOBAL_PARAM_IN <<- param_in - param_in <- GLOBAL_PARAM_IN - #remove(myScore) + myScore <- scoreparameters_BiDAG_BGe(n = ncol(GE), data = GE) + param_in <- list(n=myScore$n, awpN=myScore$awpN, TN=myScore$TN, scoreconstvec=myScore$scoreconstvec) + remove(myScore) + #saveRDS(param_in, "param.RData") + #param_in <- readRDS("param.RData") cat("TN matrix have a size of: ", dim(param_in$TN), "\n") + + ref_score <- score_G(PK_ref, PK_in, beta_L, beta_H, param_in) + cat("Score of prior matrix: \n") + print(ref_score$pG + ref_score$pDG) + #remove(ref_score) + #remove(PK_ref) - set.seed(0) - - start.time <- Sys.time() - print(start.time) + cat("Running the SK-SS Rcpp part:\n") G_out <- c_SK_SS(PK_in,SK_in, sizes, param_in, - I, debug, beta_L, beta_H, + I, debug, beta_L, beta_H, score_G, score_change_G) + PK_out <- matrix(-Inf, nrow(PK), ncol(PK)) + rownames(PK_out) <- unlist(parts) + colnames(PK_out) <- unlist(parts) + PK_out[parts[[1]], parts[[2]]] <- G_out[[1]] + PK_out[parts[[2]], parts[[3]]] <- G_out[[2]] + PK_out <- exp(PK_out) + PK_out <- PK_out >= alpha - end.time <- Sys.time() - print(end.time) - print(end.time - start.time) - - print(G_out) - saveRDS(G_out, "G_out_1_iter.RData") - # TODO convert to NxN matrix form - - return(G_out) + return(list(PK_out=PK_out, PK_score=ref_score$pG + ref_score$pDG)) } diff --git a/source/skeleton.cpp b/source/skeleton.cpp index 7c6c3831888d2e4b4dce58f2dc60202c51af40a4..a42a04dd2b321bb1e28c505be02aabcba180be7d 100644 --- a/source/skeleton.cpp +++ b/source/skeleton.cpp @@ -9,6 +9,7 @@ #include <vector> #include <algorithm> #include <map> +#include <limits> // Pure C99 #include <cstring> @@ -21,6 +22,8 @@ * We include the external library of RcppArmadillo, * this allows for a future usage of linear algebra, advanced Rcpp, etc... * + * Next, we include the external library of RcppParallel for low-level C++ multithreading + * * NOT USED AS FOR NOW */ // [[Rcpp::depends(RcppArmadillo)]] @@ -28,6 +31,9 @@ #include <RcppArmadilloExtensions/sample.h> //#include <Rcpp.h> +// [[Rcpp::depends(RcppParallel)]] +#include <RcppParallel.h> + /** * Most used Rcpp objects, used to remove Rcpp prefix */ @@ -44,11 +50,37 @@ using Rcpp::Function; using Rcpp::IntegerVector; using Rcpp::_; +inline static double RNegInf = -std::numeric_limits<double>::infinity(); + /** * Also -- include the "saveRDS" function for partial result serialization and save */ Rcpp::Environment base("package:base"); +//Rcpp::Environment mStats("package:matrixStats"); + Function saveRDS = base["saveRDS"]; +Function sumMat = base["sum"]; + +//Function rowLogSumExps = mStats["rowLogSumExps"]; + +/** + * A Log-Sum-Exp trick for two very low log values to prevent overflow/underflow + */ +double logSumExp(double x, double y) +{ + double mx = (x<y)?y:x; + + double xr = x - mx; + double yr = y - mx; + + + return mx + log( + exp(xr) + exp(yr) + ); + +} + + /** @@ -97,16 +129,36 @@ struct key_comp return std::get<1>(k1) < std::get<1>(k2); } }; +struct key_eq +{ + inline bool operator() (const key_tp& k1, const key_tp& k2) + { + if(std::get<0>(k1) != std::get<0>(k2)) return false; + if(std::get<2>(k1) != std::get<2>(k2)) return false; + return std::get<1>(k1) == std::get<1>(k2); + } +}; -void insert_sorted( std::vector<key_tp> & vec, key_tp const& item ) +void insert_sorted( std::vector<key_tp> & vec, key_tp item ) { - vec.insert - ( - std::upper_bound( vec.begin(), vec.end(), item, key_comp()), - item - ); + for(uint32_t i = 0; i < vec.size(); i++) + { + if(key_comp()(vec[i], item)) + { + continue; + } + if(key_eq()(vec[i], item)) + { + Rcout << "ERROR! You are trying to insert a key that is already inside! Check your code!\n"; + return; + } + vec.insert(vec.begin() + i, item); + return; + } + vec.push_back(item); + return; } -void erase_sorted(std::vector<key_tp> & vec, key_tp const& item ) +void erase_sorted(std::vector<key_tp> & vec, key_tp item ) { vec.erase(std::remove(vec.begin(), vec.end(), item), vec.end()); } @@ -121,31 +173,6 @@ enum ChangeType // ########################################### COMPRESSED TRIE CODE ############################################################### -/** - * Apply the key_tp's triplet index as TRUE on adjacency matrix - */ -inline static void G_forward(const key_tp &key, std::vector<LogicalMatrix*>& G_adjs) -{ - const uint8_t one = std::get<0>(key); - const uint16_t two = std::get<1>(key); - const uint16_t three = std::get<2>(key); - - - (*G_adjs[one])(two, three) = true; -} - -/** - * Apply the key_tp's triplet index as FALSE on adjacency matrix - */ -inline static void G_backward(const key_tp &key, std::vector<LogicalMatrix*>& G_adjs) -{ - const uint8_t one = std::get<0>(key); - const uint16_t two = std::get<1>(key); - const uint16_t three = std::get<2>(key); - - (*G_adjs[one])(two, three) = true; -} - struct Node { bool is_graph = false; @@ -173,7 +200,35 @@ struct Node } }; -#define ALLOC_SIZE ((2UL << 13)) +inline static void G_change(Node* node, std::vector<LogicalMatrix*>& G_adjs, bool value) +{ + for(uint32_t i = 0; i < node->num_values; i++) + { + key_tp key = node->values[i]; + const uint8_t one = std::get<0>(key); + const uint16_t two = std::get<1>(key); + const uint16_t three = std::get<2>(key); + (*G_adjs[one])(two, three) = value; + } +} + +/** + * Apply the Node's key_tp triplet index as TRUE on adjacency matrix + */ +inline static void G_forward(Node* node, std::vector<LogicalMatrix*>& G_adjs) +{ + G_change(node, G_adjs, true); +} +/** + * Apply the key_tp's triplet index as FALSE on adjacency matrix + */ +inline static void G_backward(Node* node, std::vector<LogicalMatrix*>& G_adjs) +{ + G_change(node, G_adjs, false); +} + + +#define ALLOC_SIZE ((5UL << 13)) Node PREALLOCATED_POOL[ALLOC_SIZE]; /** @@ -191,7 +246,7 @@ public: { if (this->current_index >= ALLOC_SIZE) { - Rprintf("Maximum number of nodes (%5lu) reached! Now allocating one by one!", ALLOC_SIZE); + //Rprintf("Maximum number of nodes (%5lu) reached! Now allocating one by one!", ALLOC_SIZE, "\n"); Node* ret = new Node; return ret; } @@ -325,6 +380,7 @@ bool CompressedTrie::appendRecursive(key_tp* word, uint32_t len) // append new node with entire word Node* appended = this->createNode(word, len); currNode->next_map[currFirstLetter] = appended; + appended->is_graph = true; return true; } @@ -337,13 +393,13 @@ bool CompressedTrie::appendRecursive(key_tp* word, uint32_t len) if(moveIndex < cand->num_values) { subdivideNode(cand, moveIndex); - } // If node is our entire word -> mark and end if(moveIndex == len) { + bool ret = cand->is_graph; cand->is_graph = true; - return true; + return ret; } // go deeper @@ -442,7 +498,12 @@ void CompressedTrie::appendWordToNode(Node* toAppend, key_tp* word, uint32_t len void CompressedTrie::SampleAdjacencyMatrix(std::vector<LogicalMatrix*>& matrix) { + if(this->total_num_graphs == 0) + { + return; + } this->current_needed_index = this->SampleLong() % this->total_num_graphs; + this->current_needed_index++; this->current_visited_index = 0; this->current_G_adjs = &matrix; @@ -463,7 +524,7 @@ void CompressedTrie::findRecursive(Node *currNode) const auto &key = item.first; const auto &value = item.second; - G_forward(key, *this->current_G_adjs); + G_forward(value, *this->current_G_adjs); this->current_visited_index += value->is_graph; @@ -474,7 +535,7 @@ void CompressedTrie::findRecursive(Node *currNode) return; } - G_backward(key, *this->current_G_adjs); + G_backward(value, *this->current_G_adjs); } return; @@ -514,9 +575,10 @@ struct ISettings bool debug; double beta_L; double beta_H; + //double cutoff_ratio; }; -#define SAVE_FREQ 1000 +#define SAVE_FREQ 500 #define PRINT_FREQ 10000 #define PARTIAL_ADD_ONLY "[pSK-SS][add only]" #define PARTIAL_ADD_REMOVE "[pSK-SS][add-remove]" @@ -535,8 +597,7 @@ protected: */ std::vector<key_tp> ord_V_G; - std::vector<key_action_tp> unord_V_G; - + /** * store the current adjacency graph and the best MAP model found during the search * > update only when better model is found @@ -544,10 +605,19 @@ protected: std::vector<LogicalMatrix*> G; std::vector<LogicalMatrix*> G_sampled; - double probabilityMaxOptim = -99999999999.0; - std::vector<LogicalMatrix*> G_optim; + std::vector<double> acceptedScores; + std::vector<double> acceptedScoresPrior; + + uint32_t current_id = 0; + + //double probabilityMaxOptim = -99999999999.0; + //std::vector<LogicalMatrix*> G_optim; + + std::vector<NumericMatrix*> G_posterior; + double logSumScores = RNegInf; + public: Network(IData input, ISettings settings) { @@ -568,61 +638,103 @@ public: { Rcout << "partialSK-SS, Iteration: " << i << "\n"; this->emptyCurrCountsAndG(); + + //Rcout << sumMat(*this->G[1]) << " x " << sumMat(*this->G_sampled[2]) << "\n"; + this->partialSK_SS_iter(); + this->ord_V_G.clear(); } } void SK_SS() { this->partialSK_SS(); - /* - uint32_t sampled_subgraphs = this->unord_V_G.size(); - IntegerVector indices = Rcpp::sample(sampled_subgraphs, this->settings.I); - indices.insert(indices.begin(), 1); - std::sort(indices.begin(), indices.end());*/ + + std::vector< + std::vector<LogicalMatrix*> + > all_G_sampled; for(uint32_t i = 0; i < this->settings.I; i++) { - Rcout << "SK-SS, Iteration: " << i << "\n"; - /* - for(uint32_t j = indices[i]; j < indices[i+1]; j++) + std::vector<LogicalMatrix*> to_sample = std::vector<LogicalMatrix*>(); + const List* SK_list = this->input_data.SK; + + for(uint32_t i = 0; i < SK_list->length(); i++) { - this->unord_V_G[j-1].forward(this->G_sampled); - }*/ - this->V_G.SampleAdjacencyMatrix(this->G_sampled); + const IntegerMatrix& sk = (*SK_list)[i]; + to_sample.push_back(new LogicalMatrix(sk.nrow(), sk.ncol())); + Rcpp::rownames(*(to_sample[i])) = Rcpp::rownames(sk); + Rcpp::colnames(*(to_sample[i])) = Rcpp::colnames(sk); + } + this->V_G.SampleAdjacencyMatrix(to_sample); + all_G_sampled.push_back(to_sample); + } + + for(uint32_t i = 0; i < this->settings.I; i++) + { + Rcout << "SK-SS, Iteration: " << i << "\n"; + this->ord_V_G.clear(); + //this->resetSampledG(); + this->G_sampled = all_G_sampled[i]; this->emptyCurrCountsAndG(); this->SK_SS_phase2_iter(); } + + Rprintf("End of SK_SS() function here\n"); } List& result() { - Rprintf("Optimum log probability: %.10f\n", this->probabilityMaxOptim); + //Rprintf("Optimum log probability: %.10f\n", this->probabilityMaxOptim); const List* SK_list = this->input_data.SK; List* ret = new List(SK_list->length()); for(uint32_t i = 0; i < SK_list->length(); i++) { - (*ret)[i] = *this->G_optim[i]; + /* + Rcout << (*this->G_posterior[i])(1, 7) << "," + << (*this->G_posterior[i])(3, 12) << " - " + << this->logSumScores << "\n";*/ + (*ret)[i] = *this->G_posterior[i] - this->logSumScores; } + Rcout << "logSumScores: " << this->logSumScores << "\n"; + return *ret; } protected: void SK_SS_phase2_iter() { - NumericVector twoScores = this->callScoreFull(); + Rprintf("phase 2\n"); + List twoScores = this->callScoreFull(); const List* SK_list = this->input_data.SK; IntegerVector riter = Rcpp::sample( - this->input_data.N, this->input_data.N); + this->input_data.N, (this->input_data.N)); - for(uint32_t i = 0; i < this->input_data.N; i++) + saveRDS(riter, Rcpp::Named("file", "sampled_indices.RData")); + + + double bge = twoScores["pDG"]; + double energy = twoScores["pG"]; + double probabilityMovedG = bge + energy; + + /* + if(probabilityMovedG > this->probabilityMaxOptim) + { + Rcout << "YOU IDIOT, YOU HAVE BROKEN SAMPLING!\n"; + Rprintf("%.9f x %.9f\n", (probabilityMovedG), this->probabilityMaxOptim); + }*/ + + + for(uint32_t i = 0; i < riter.size(); i++) { + this->current_id = i; + uint32_t flatindex = riter[i] - 1; uint32_t lindex = 0; uint32_t xyindex = flatindex; @@ -633,7 +745,7 @@ protected: uint32_t sizeMatrix = (*this->input_data.sizes)[lindex] * (*this->input_data.sizes)[lindex + 1]; sumSizes += sizeMatrix; - if(sumSizes >= flatindex) + if(sumSizes > flatindex) { break; } @@ -645,7 +757,7 @@ protected: uint32_t r = xyindex % sk.nrow(); uint32_t c = xyindex / sk.nrow(); - if(i % PRINT_FREQ == 0) + if(i % PRINT_FREQ == 0) // i % PRINT_FREQ == 0 { this->DebugInfo(SKSS_PHASE, i); } @@ -662,23 +774,31 @@ protected: // try add an edge outside of G_0 // do not update V_G since we want to sample from partialSK-SS only! // because of that -> shouldUpdateTrie = false - this->tryPerformMove(&twoScores, lindex, r, c, ChangeType::ADD, false); + //this->tryPerformMove(&twoScores, lindex, r, c, ChangeType::ADD, false); + this->tryPerformMove(&twoScores, lindex, r, c, ChangeType::ADD, true); + } + Rprintf("end of phase 2\n"); } void partialSK_SS_iter() { - NumericVector twoScores = this->callScoreFull(); + List twoScores = this->callScoreFull(); const List* SK_list = this->input_data.SK; + const List* PK_list = this->input_data.PK; IntegerVector riter = Rcpp::sample( this->input_data.N, this->input_data.N); + //saveRDS(riter, Rcpp::Named("file", "sampled_indices.RData")); Rprintf("partialSKSS-add-only phase\n"); + for(uint32_t i = 0; i < this->input_data.N; i++) { + this->current_id = i; + uint32_t flatindex = riter[i] - 1; uint32_t lindex = 0; uint32_t xyindex = flatindex; @@ -690,7 +810,7 @@ protected: sumSizes += sizeMatrix; - if(sumSizes >= flatindex) + if(sumSizes > flatindex) { break; } @@ -699,6 +819,7 @@ protected: } const IntegerMatrix& sk = (*SK_list)[lindex]; + const IntegerMatrix& pk = (*PK_list)[lindex]; uint32_t r = xyindex % sk.nrow(); uint32_t c = xyindex / sk.nrow(); @@ -706,10 +827,15 @@ protected: { this->DebugInfo(PARTIAL_ADD_ONLY, i); } - if(sk(r, c) == 0) + // if((sk(r, c) == 0) && + // (pk(r, c) != 1)) + if((sk(r, c) == 0)) { continue; } + if(i % SAVE_FREQ == 0) + this->SaveGraph(); + this->tryPerformMove(&twoScores, lindex, r, c, ChangeType::ADD); } @@ -731,7 +857,7 @@ protected: sumSizes += sizeMatrix; - if(sumSizes >= flatindex) + if(sumSizes > flatindex) { break; } @@ -740,17 +866,20 @@ protected: } const IntegerMatrix& sk = (*SK_list)[lindex]; + const IntegerMatrix& pk = (*PK_list)[lindex]; uint32_t r = xyindex % sk.nrow(); uint32_t c = xyindex / sk.nrow(); - if(i % PRINT_FREQ == 0) + if(i % (PRINT_FREQ * 1000) == 0) { this->DebugInfo(PARTIAL_ADD_REMOVE, i); } if(i % SAVE_FREQ == 0) this->SaveGraph(); - if(sk(r, c) == 0) + // if((sk(r, c) == 0) && + // (pk(r, c) != 1)) + if((sk(r, c) == 0)) { continue; } @@ -766,10 +895,11 @@ protected: void DebugInfo(std::string phase, uint32_t iter) { - Rprintf("%s[%5u]/[%5u][Trie:%5u|%5u][Score:%9.6f]\n", + Rprintf("%s[%5u]/[%5u][Trie:%5u|%5u]\n", //[Score:%9.6f] phase.c_str(), iter, this->input_data.N, - this->V_G.num_graphs(), this->V_G.num_nodes(), - this->probabilityMaxOptim); + this->V_G.num_graphs(), this->V_G.num_nodes() + //,this->probabilityMaxOptim + ); } List& G_to_list() @@ -779,7 +909,6 @@ protected: for(uint32_t i = 0; i < SK_list->length(); i++) { - const IntegerMatrix& sk = (*SK_list)[i]; (*ret)[i] = *this->G[i]; } @@ -793,13 +922,29 @@ protected: for(uint32_t i = 0; i < SK_list->length(); i++) { - const IntegerMatrix& sk = (*SK_list)[i]; - (*ret)[i] = *this->G_optim[i]; + // Rcout << (*this->G_posterior[i])(1, 7) << "," + // << (*this->G_posterior[i])(3, 12) << " - " + // << this->logSumScores << "\n"; + (*ret)[i] = *this->G_posterior[i];// - this->logSumScores; } return *ret; } + void resetSampledG() + { + const List* SK_list = this->input_data.SK; + + for(uint32_t i = 0; i < SK_list->length(); i++) + { + const IntegerMatrix& sk = (*SK_list)[i]; + + *(this->G_sampled)[i] = LogicalMatrix(sk.nrow(), sk.ncol()); + Rcpp::rownames(*(this->G_sampled)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->G_sampled)[i]) = Rcpp::colnames(sk); + } + } + void emptyCurrCountsAndG() { const List* SK_list = this->input_data.SK; @@ -808,8 +953,9 @@ protected: { const IntegerMatrix& sk = (*SK_list)[i]; - //*(this->G)[i] = LogicalMatrix(sk.nrow(), sk.ncol()); - *(this->G)[i] = *(this->G_sampled)[i]; + *(this->G)[i] = Rcpp::LogicalMatrix( + Rcpp::clone(*this->G_sampled[i]) + ); Rcpp::rownames(*(this->G)[i]) = Rcpp::rownames(sk); Rcpp::colnames(*(this->G)[i]) = Rcpp::colnames(sk); } @@ -833,13 +979,18 @@ protected: Rcpp::rownames(*(this->G)[i]) = Rcpp::rownames(sk); Rcpp::colnames(*(this->G)[i]) = Rcpp::colnames(sk); - (this->G_optim).push_back(new LogicalMatrix(sk.nrow(), sk.ncol())); - Rcpp::rownames(*(this->G_optim)[i]) = Rcpp::rownames(sk); - Rcpp::colnames(*(this->G_optim)[i]) = Rcpp::colnames(sk); + //(this->G_optim).push_back(new LogicalMatrix(sk.nrow(), sk.ncol())); + //Rcpp::rownames(*(this->G_optim)[i]) = Rcpp::rownames(sk); + //Rcpp::colnames(*(this->G_optim)[i]) = Rcpp::colnames(sk); (this->G_sampled).push_back(new LogicalMatrix(sk.nrow(), sk.ncol())); Rcpp::rownames(*(this->G_sampled)[i]) = Rcpp::rownames(sk); Rcpp::colnames(*(this->G_sampled)[i]) = Rcpp::colnames(sk); + + (this->G_posterior).push_back(new NumericMatrix(sk.nrow(), sk.ncol())); + Rcpp::rownames(*(this->G_posterior)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->G_posterior)[i]) = Rcpp::colnames(sk); + *(this->G_posterior)[i] = *(this->G_posterior)[i] + RNegInf; } } @@ -853,19 +1004,21 @@ protected: for(uint32_t i = 0; i < SK_list->length(); i++) { delete this->G[i]; - delete this->G_optim[i]; + //delete this->G_optim[i]; + delete this->G_posterior[i]; } this->G.clear(); - this->G_optim.clear(); + //this->G_optim.clear(); + this->G_posterior.clear(); } - void tryPerformMove(NumericVector* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action, + void tryPerformMove(List* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action, bool shouldUpdateTrie = true) { // Compute the possible score - NumericVector scoreChanges = this->callScoreChange(score, listID, from, to, action); + List scoreChanges = this->callScoreChange(score, listID, from, to, action); /** * Modified(explicit per-partes) is used, e.g.: @@ -876,40 +1029,62 @@ protected: * > the previous is because they have orders of magnitude differences * > ONLY afterwards the summed difference is converted by exp, since it is tractable! */ - double mod_diff = (scoreChanges[3] + scoreChanges[4]); + double prior_diff = scoreChanges["pG_change"]; + double bge_diff = scoreChanges["pDG_change"]; + double mod_diff = (prior_diff + bge_diff);\ double diff = exp(mod_diff); - + // Convert the log difference to B/A and normalize it // normalize(1, B/A) = normalize(A,B) // that way we get probabilities for two candidates! NumericVector probs = {1.0, diff}; probs = probs / Rcpp::sum(probs); + + //Rcout << "probs: " << probs[0] << "," << probs[1] << "\n"; + IntegerVector ids = {0, 1}; IntegerVector ret = Rcpp::sample(ids, 1, probs = probs); // If the first was sampled, skip if(ret[0] != 1) return; - // otherwise update current state: - // Update score *score = scoreChanges; + this->UpdateVisited(scoreChanges); + // Update G if not in SK-SS part - this->makeMove(listID, from, to, action, shouldUpdateTrie); + bool isNew = this->makeMove(listID, from, to, action, shouldUpdateTrie); + + if(isNew){ + // Update posterior sum + this->UpdatePosterior(scoreChanges); + } // Update optimum - this->CheckVisitedGraphOptimum(scoreChanges); + //this->CheckVisitedGraphOptimum(scoreChanges); } + void UpdateVisited(List& scoreChanges) + { + double energy = scoreChanges["pG"]; + double bge = scoreChanges["pDG"]; + double probabilityMovedG = bge + energy; + this->acceptedScores.push_back(bge); + this->acceptedScoresPrior.push_back(energy); + } - - void makeMove(uint32_t listID, uint32_t from, uint32_t to, ChangeType action, bool shouldUpdateTrie) + bool makeMove(uint32_t listID, uint32_t from, uint32_t to, ChangeType action, bool shouldUpdateTrie) { // update adjacency (*(this->G[listID]))(from,to) = action == ChangeType::ADD; + // If SK-SS part -> don't update Trie + if(!shouldUpdateTrie) + { + return true; + } // Update our ordered add-only word key_tp kt = std::make_tuple((uint8_t)listID, (uint16_t)from, (uint16_t)to); @@ -921,70 +1096,128 @@ protected: erase_sorted(this->ord_V_G, kt); } - // If SK-SS part -> don't update Trie - if(!shouldUpdateTrie) - { - return; - } - // Try to add word to Trie - this->V_G.AppendChain(this->ord_V_G); - - key_action_tp to_push; - to_push.type = action; - to_push.coord = kt; - //this->unord_V_G.push_back(to_push); + return this->V_G.AppendChain(this->ord_V_G); } void SaveGraph() { - /* - if(this->probabilityMaxOptim <= -80000000000.0) - { - return; - }*/ + // Save intermediate results - Rprintf("Saving intermediate results [%9.6f]\n", this->probabilityMaxOptim); - List& tmp_adjs = this->G_to_list(); + //Rprintf("Saving intermediate results [%9.6f]\n", this->logSumScores); + + // Save optimum + List& tmp_adjs = this->G_optim_to_list(); char fileName[50]; - sprintf(fileName, "optimG_%9.6f.RData", this->probabilityMaxOptim); + char fileName2[50]; + //sprintf(fileName, "optimG_%9.6f.RData", this->probabilityMaxOptim); + //sprintf(fileName2, "optimG_%9.6f_copy.RData", this->probabilityMaxOptim); + sprintf(fileName, "G_%d.RData", this->current_id); + saveRDS(tmp_adjs, Rcpp::Named("file", fileName)); + //saveRDS(tmp_adjs, Rcpp::Named("file", "G_posterior.RDS")); + //saveRDS(tmp_adjs, Rcpp::Named("file", "G_posterior_copy.RDS")); + + // Save current + tmp_adjs = this->G_to_list(); + List ret = List(2); + ret[0] = this->current_id; + ret[1] = tmp_adjs; + //saveRDS(ret, Rcpp::Named("file", "G_last.RData")); + + // Save current + + char fileName3[50]; + tmp_adjs = this->G_to_list(); + sprintf(fileName3, "G_%d.RData", this->current_id); + //saveRDS(ret, Rcpp::Named("file", fileName3)); + + // Save visited scores and reset them + ret = List(2); + ret[0] = Rcpp::wrap(acceptedScores); + ret[1] = Rcpp::wrap(acceptedScoresPrior); + saveRDS(ret, Rcpp::Named("file", "accepted_scores.RData")); + //saveRDS(ret, Rcpp::Named("file", "accepted_scores_copy.RData")); + } + + + void UpdatePosterior(List& scoreChanges) + { + double energy = scoreChanges["pG"]; + double bge = scoreChanges["pDG"]; + double probabilityMovedG = bge + energy; + //Rcout << "Previous logSumScores: " << this->logSumScores << ", adding (logSumExp): " << probabilityMovedG << "\n"; + this->logSumScores = logSumExp(logSumScores, probabilityMovedG); + + uint32_t tCounter = 0; + for(uint32_t lindex = 0; lindex < (this->input_data.SK)->length(); lindex++) + { + const IntegerMatrix& sk = (*this->input_data.SK)[lindex]; + for(uint32_t r = 0; r < sk.nrow(); r++) + { + for(uint32_t c = 0; c < sk.ncol(); c++) + { + bool ret = (*this->G[lindex])(r,c); + if(!ret) continue; + + tCounter++; + double prev = (*this->G_posterior[lindex])(r,c); + double newval = logSumExp(prev, probabilityMovedG); + + (*this->G_posterior[lindex])(r,c) = newval; + } + } + } + + if(this->current_id % 100 == 0) + { + //Rcout << "G_adjs have " << tCounter << " edges\n"; + } + } - void CheckVisitedGraphOptimum(NumericVector& scoreChanges) + + void CheckVisitedGraphOptimum(List& scoreChanges) { - double bge = scoreChanges[1]; - double energy = scoreChanges[0]; + double energy = scoreChanges["pG"]; + double bge = scoreChanges["pDG"]; + + double probabilityMovedG = bge + energy; + this->acceptedScores.push_back(bge); + this->acceptedScoresPrior.push_back(energy); + /* if(probabilityMovedG <= this->probabilityMaxOptim) { return; } - + //Rcout << "New optim found! " << this->probabilityMaxOptim << "x" << probabilityMovedG << "\n"; + //Rcout << sum(NumericMatrix(*this->G_optim[0])) << " x " << sum(NumericMatrix(*this->G[0])) << "\n"; this->probabilityMaxOptim = probabilityMovedG; const List* SK_list = this->input_data.SK; for(uint32_t i = 0; i < SK_list->length(); i++) { - *this->G_optim[i] = *this->G[i]; - } + *this->G_optim[i] = Rcpp::LogicalMatrix( + Rcpp::clone(*this->G[i]) + ); + }*/ } - NumericVector callScoreChange(NumericVector* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action) + List callScoreChange(List* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action) { List& adjs = this->G_to_list(); - NumericVector scoreChanges = (*this->input_data.score_change_func) + List scoreChanges = (*this->input_data.score_change_func) ( adjs, *this->input_data.PK, this->settings.beta_L, this->settings.beta_H, *this->input_data.bge_param, - *this->input_data.sizes, *score, listID, from, @@ -992,13 +1225,14 @@ protected: action == ChangeType::ADD); delete &adjs; + return scoreChanges; } - NumericVector callScoreFull() + List callScoreFull() { List& adjs = this->G_to_list(); - NumericVector twoScores = (*this->input_data.score_full_func)( + List twoScores = (*this->input_data.score_full_func)( adjs, *this->input_data.PK, this->settings.beta_L, @@ -1008,9 +1242,6 @@ protected: delete &adjs; return twoScores; } - - - }; /** @@ -1033,18 +1264,54 @@ List c_SK_SS( */ //#include "skeleton.h" +// [[Rcpp::export]] +int test(const List& SK, const List& PK, double val) +{ + + + + + + + + uint32_t N = PK.length(); + + uint32_t countOnes = 0; + + for(uint32_t i = 0; i < N; i++) + { + const IntegerMatrix& pk = (PK)[i]; + const IntegerMatrix& sk = (SK)[i]; + for(uint32_t r = 0; r < pk.nrow(); r++) + { + for(uint32_t c = 0; c < pk.ncol(); c++) + { + if((sk(r, c) == 0) && + (pk(r, c) != 1)) + { + continue; + } + countOnes++; + } + } + } + return countOnes; +} + + + // [[Rcpp::export]] List c_SK_SS( - const List& PK, - const List& SK, - const IntegerVector& sizes, - const List& param, - uint32_t I, - bool debug, - double beta_L, double beta_H, - Function score_full_func, - Function score_change_func - ) + const List& PK, + const List& SK, + const IntegerVector& sizes, + const List& param, + uint32_t I, + bool debug, + double beta_L, double beta_H, + Function score_full_func, + Function score_change_func +) { uint32_t N = 0; for(uint32_t i = 0; i < (sizes.size() - 1); i++) @@ -1070,7 +1337,6 @@ List c_SK_SS( settings.debug = debug; settings.beta_H = beta_H; settings.beta_L = beta_L; - Network* net = new Network(i_data, settings); diff --git a/source/skeleton_map.cpp b/source/skeleton_map.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cf01e835394c4e50e31bda9cc420f1f98a54c403 --- /dev/null +++ b/source/skeleton_map.cpp @@ -0,0 +1,1171 @@ +/** + * This part will be the header file declarations in package, but <Rcpp::sourceCpp()> requires single file to work, so it is here for now + */ + +// Pure C++ +#include <numeric> +#include <bitset> +#include <string> +#include <vector> +#include <algorithm> +#include <map> + +// Pure C99 +#include <cstring> +#include <cmath> +#include <limits.h> +#include <cstdio> +#include <cstdint> + +/** + * We include the external library of RcppArmadillo, + * this allows for a future usage of linear algebra, advanced Rcpp, etc... + * + * NOT USED AS FOR NOW + */ +// [[Rcpp::depends(RcppArmadillo)]] +#include <RcppArmadillo.h> +#include <RcppArmadilloExtensions/sample.h> +//#include <Rcpp.h> + +/** + * Most used Rcpp objects, used to remove Rcpp prefix + */ +using Rcpp::Rcout; +using Rcpp::Rcerr; +using Rcpp::NumericVector; +using Rcpp::NumericMatrix; +using Rcpp::DataFrame; +using Rcpp::Nullable; +using Rcpp::LogicalMatrix; +using Rcpp::IntegerMatrix; +using Rcpp::List; +using Rcpp::Function; +using Rcpp::IntegerVector; +using Rcpp::_; + +/** + * Also -- include the "saveRDS" function for partial result serialization and save + */ +Rcpp::Environment base("package:base"); +Function saveRDS = base["saveRDS"]; + + +/** + * This part will be the separate header file for tuple + hash + Trie data structures + */ +#include <tuple> + +/** + * We will encode a 3D coordinate: + * > listID --> ID of the partition of our N-partition graph, goes: (0, N-2) + * Note: because, for example, for 3 partition(A,B,C), we have 2 adjacency matrices: AB, BC + * > FROM, TO --> row, column ID of the particular + */ +typedef std::tuple<uint8_t, uint16_t, uint16_t> key_tp; + +/** + * Hashing of the triplet. Uses the simple boost libraries' implementation of tuple hash + * + * https://stackoverflow.com/questions/7110301/generic-hash-for-tuples-in-unordered-map-unordered-set + */ +struct key_hash : public std::unary_function<key_tp, std::size_t> +{ + std::size_t operator()(const key_tp& k) const + { + std::size_t ret = 0; + ret ^= std::hash<uint8_t>()(std::get<0>(k)) + 0x9e3779b9 + (ret<<6) + (ret>>2); + ret ^= std::hash<uint16_t>()(std::get<1>(k)) + 0x9e3779b9 + (ret<<6) + (ret>>2); + ret ^= std::hash<uint16_t>()(std::get<2>(k)) + 0x9e3779b9 + (ret<<6) + (ret>>2); + + return ret; + } +}; + +struct key_comp +{ + /** + * Java/C++ like comparator function: + * + * Comparison is performed in the order: + * listID(0) -> to(2) -> from(1) + */ + inline bool operator() (const key_tp& k1, const key_tp& k2) + { + if(std::get<0>(k1) != std::get<0>(k2)) return std::get<0>(k1) < std::get<0>(k2); + if(std::get<2>(k1) != std::get<2>(k2)) return std::get<2>(k1) < std::get<2>(k2); + return std::get<1>(k1) < std::get<1>(k2); + } +}; +struct key_eq +{ + inline bool operator() (const key_tp& k1, const key_tp& k2) + { + if(std::get<0>(k1) != std::get<0>(k2)) return false; + if(std::get<2>(k1) != std::get<2>(k2)) return false; + return std::get<1>(k1) == std::get<1>(k2); + } +}; + +void insert_sorted( std::vector<key_tp> & vec, key_tp item ) +{ + for(uint32_t i = 0; i < vec.size(); i++) + { + if(key_comp()(vec[i], item)) + { + continue; + } + if(key_eq()(vec[i], item)) + { + Rcout << "ERROR! You are trying to insert a key that is already inside! Check your code!\n"; + return; + } + vec.insert(vec.begin() + i, item); + return; + } + vec.push_back(item); + return; +} +void erase_sorted(std::vector<key_tp> & vec, key_tp item ) +{ + vec.erase(std::remove(vec.begin(), vec.end(), item), vec.end()); +} + + +enum ChangeType +{ + ADD = true, + DELETE = false +}; + + + +// ########################################### COMPRESSED TRIE CODE ############################################################### +struct Node +{ + bool is_graph = false; + key_tp *values = nullptr; + uint32_t num_values = 0; + + std::unordered_map< + key_tp, + Node *, + key_hash> + next_map; + + void print() + { + std::cout << "["; + for(uint32_t i = 0; i < num_values; i++) + { + key_tp k = values[i]; + std::cout << "(" + << (uint32_t)std::get<0>(k) << "," + << std::get<1>(k) << "," + << std::get<2>(k) << ") "; + } + std::cout << "] "; + } +}; + +inline static void G_change(Node* node, std::vector<LogicalMatrix*>& G_adjs, bool value) +{ + for(uint32_t i = 0; i < node->num_values; i++) + { + key_tp key = node->values[i]; + const uint8_t one = std::get<0>(key); + const uint16_t two = std::get<1>(key); + const uint16_t three = std::get<2>(key); + (*G_adjs[one])(two, three) = value; + } +} + +/** + * Apply the Node's key_tp triplet index as TRUE on adjacency matrix + */ +inline static void G_forward(Node* node, std::vector<LogicalMatrix*>& G_adjs) +{ + G_change(node, G_adjs, true); +} +/** + * Apply the key_tp's triplet index as FALSE on adjacency matrix + */ +inline static void G_backward(Node* node, std::vector<LogicalMatrix*>& G_adjs) +{ + G_change(node, G_adjs, false); +} + + +#define ALLOC_SIZE ((5UL << 13)) +Node PREALLOCATED_POOL[ALLOC_SIZE]; + +/** + * Optimization technique -- allocate a large portion of nodes in advance on stack, + * so that the OS won't spend any additional time allocating nodes one by one. + */ +class NodePool +{ +public: + NodePool() + { + } + + Node *get() + { + if (this->current_index >= ALLOC_SIZE) + { + //Rprintf("Maximum number of nodes (%5lu) reached! Now allocating one by one!", ALLOC_SIZE, "\n"); + Node* ret = new Node; + return ret; + } + Node *ret = PREALLOCATED_POOL + this->current_index; + this->current_index++; + return ret; + } + + uint64_t size() + { + return this->current_index; + } + +protected: + uint64_t current_index = 0; +}; + +/** + * After Trie receives a word, only the end of it is actually a subgraph, + * + * Example to illustrate the intermediate subgraph mechanics: + * add 2 -> add 1 -> add 5 -> add 4 -> delete 2 -> add 3 + * + * then, the visited subgraphs are: + * {[2], [1, 2], [1, 2, 5], [1, 2, 4, 5], [1, 4, 5], [1, 3, 4, 5]} + * + * + * which results in Trie (Which is then compressed): + * + * 2 + * root---| 5 + * | 2--| + * 1*--| 4*---5 + * | + * 4*--5 + * | + * 3*--4*--5 + * + * + * ! Note -> nodes with * haven't been visited and, thus, not subgraphs ! + * Compressed trie example: + * https://www.geeksforgeeks.org/compressed-tries/ + * + * + * > if no flag is used to determine the node in Trie, we can't mark the above + * > when pushing a word, only its last tail node will be marked + */ +class CompressedTrie +{ +public: + CompressedTrie() {} + ~CompressedTrie() {} + + /** + * Append a word consisting of sorted ADD only nodes. + * return TRUE only if non-visited subgraph added + */ + bool AppendChain(std::vector<key_tp>& word); + + /** + * Print current Trie, used only for debug purposes + */ + void PrintTrie(); + + /** + * Randomly chooses a graph from the Trie and sets the matrix(adjacency) + * to it + */ + void SampleAdjacencyMatrix(std::vector<LogicalMatrix*>& matrix); + + /** + * Return the number of nodes in a graph + */ + uint64_t num_nodes() {return this->node_pool.size();} + + /** + * Return the number of nodes that are actually subgraphs + */ + uint32_t num_graphs() {return this->total_num_graphs;} + +protected: + NodePool node_pool; + Node *root = this->node_pool.get(); + + uint32_t total_num_graphs = 0; + + uint32_t current_needed_index; + uint32_t current_visited_index; + std::vector<LogicalMatrix*>* current_G_adjs; + void findRecursive(Node *currNode); + + void appendWordToNode(Node* toAppend, key_tp* word, uint32_t len); // TODO remove, not used when no delete + bool appendRecursive(key_tp* word, uint32_t len); + uint32_t compareValues(Node* toAppend, key_tp* word, uint32_t len); + Node* createNode(key_tp* word, uint32_t len); + void subdivideNode(Node* node, uint32_t divideIndex); + + uint64_t SampleLong() + { + uint32_t sampledIndex = Rcpp::sample(this->total_num_graphs, 1)[0] - 1; + uint32_t sampledIndex2 = Rcpp::sample(this->total_num_graphs, 1)[0] - 1; + uint64_t ret = sampledIndex; + ret = ret << 32; + ret = ret + sampledIndex2; + return ret; + } +}; + + + +bool CompressedTrie::AppendChain(std::vector<key_tp>& word) +{ + bool ret = this->appendRecursive(&word[0], word.size()); + this->total_num_graphs += ret; + return ret; +} + + +bool CompressedTrie::appendRecursive(key_tp* word, uint32_t len) +{ + Node* currNode = this->root; + + + while(true) + { + key_tp currFirstLetter = word[0]; + + // No branch exists for our word + if(currNode->next_map.find(currFirstLetter) == currNode->next_map.end()) + { + // append new node with entire word + Node* appended = this->createNode(word, len); + currNode->next_map[currFirstLetter] = appended; + appended->is_graph = true; + return true; + } + + Node* cand = currNode->next_map[currFirstLetter]; + + // Find an index at which graph and word divert + uint32_t moveIndex = this->compareValues(cand, word, len); + + // If node isnt fully in our word -> divide aligned and unaligned part + if(moveIndex < cand->num_values) + { + subdivideNode(cand, moveIndex); + } + // If node is our entire word -> mark and end + if(moveIndex == len) + { + bool ret = cand->is_graph; + cand->is_graph = true; + return ret; + } + + // go deeper + currNode = cand; + word = word + moveIndex; + len = len - moveIndex; + } + + + + return false; +} + +#include <queue> + +void CompressedTrie::PrintTrie() +{ + std::queue<Node*> inorder_q; + inorder_q.push(this->root); + + std::cout << "#############################\n"; + + while(!inorder_q.empty()) + { + uint32_t size = inorder_q.size(); + for(int i = 0; i < size; i++) + { + Node* n = inorder_q.front(); + inorder_q.pop(); + n->print(); + + for(const auto& value : n->next_map) + { + inorder_q.push(value.second); + } + } + std::cout << "\n"; + } +} + +void CompressedTrie::subdivideNode(Node* node, uint32_t divideIndex) +{ + uint32_t prevLen = node->num_values; + if(prevLen == divideIndex) return; + + + node->num_values = divideIndex; + + Node* newNode = this->node_pool.get(); + newNode->values = node->values + divideIndex; + newNode->num_values = prevLen - divideIndex; + newNode->next_map = node->next_map; + newNode->is_graph = node->is_graph; + + node->next_map = {}; + node->next_map[newNode->values[0]] = newNode; + node->is_graph = false; + + return; +} + +uint32_t CompressedTrie::compareValues(Node* toAppend, key_tp* word, uint32_t len) +{ + uint32_t size = toAppend->num_values; + size = (size < len) ? size : len; + + for(uint32_t i = 1; i < size; i++) + { + if(word[i] != toAppend->values[i]) + { + return i; + } + } + return size; +} + +Node* CompressedTrie::createNode(key_tp* word, uint32_t len) +{ + Node* ret = this->node_pool.get(); + ret->num_values = len; + ret->values = new key_tp[len]; + std::memcpy(ret->values, (void*)word, len * sizeof(key_tp)); + + return ret; +} + +void CompressedTrie::appendWordToNode(Node* toAppend, key_tp* word, uint32_t len) +{ + key_tp* newValues = new key_tp[toAppend->num_values + len]; + std::memcpy(newValues, toAppend->values, toAppend->num_values * sizeof(key_tp)); + std::memcpy(newValues + toAppend->num_values, word, len * sizeof(key_tp)); + delete[] toAppend->values; + toAppend->values = newValues; + toAppend->num_values += len; +} + +void CompressedTrie::SampleAdjacencyMatrix(std::vector<LogicalMatrix*>& matrix) +{ + if(this->total_num_graphs == 0) return; + this->current_needed_index = this->SampleLong() % this->total_num_graphs; + this->current_needed_index++; + this->current_visited_index = 0; + this->current_G_adjs = &matrix; + + this->findRecursive(this->root); + + return; +} + +void CompressedTrie::findRecursive(Node *currNode) +{ + if (this->current_visited_index == this->current_needed_index) + { + return; + } + + for (const auto &item : currNode->next_map) + { + const auto &key = item.first; + const auto &value = item.second; + + G_forward(value, *this->current_G_adjs); + + this->current_visited_index += value->is_graph; + + this->findRecursive(value); + + if (this->current_visited_index == this->current_needed_index) + { + return; + } + + G_backward(value, *this->current_G_adjs); + } + + return; +} + +struct key_action_tp +{ + ChangeType type; + key_tp coord; + + void forward(std::vector<LogicalMatrix*>& adjs) + { + uint32_t listID = std::get<0>(coord); + uint32_t from = std::get<1>(coord); + uint32_t to = std::get<2>(coord); + + (*adjs[listID])(from, to) = type; + } +}; +// ########################################### COMPRESSED TRIE CODE ############################################################### + + +struct IData +{ + const List* PK; + const List* SK; + const IntegerVector* sizes; + uint32_t N; + const List* bge_param; + + Function* score_full_func; + Function* score_change_func; +}; +struct ISettings +{ + uint32_t I; + bool debug; + double beta_L; + double beta_H; + //double cutoff_ratio; +}; + +#define SAVE_FREQ 10000 +#define PRINT_FREQ 10000 +#define PARTIAL_ADD_ONLY "[pSK-SS][add only]" +#define PARTIAL_ADD_REMOVE "[pSK-SS][add-remove]" +#define SKSS_PHASE "[SK-SS][add-remove]" + +class Network +{ +protected: + IData input_data; + ISettings settings; + CompressedTrie V_G; + + /** + * sort individual elements from lowest to largest by unordered_set + * this will be used to push to TRIE -> count subgraphs + */ + std::vector<key_tp> ord_V_G; + + + /** + * store the current adjacency graph and the best MAP model found during the search + * > update only when better model is found + */ + std::vector<LogicalMatrix*> G; + std::vector<LogicalMatrix*> G_sampled; + + std::vector<double> acceptedScores; + std::vector<double> acceptedScoresPrior; + + uint32_t current_id = 0; + + double probabilityMaxOptim = -99999999999.0; + std::vector<LogicalMatrix*> G_optim; + + +public: + Network(IData input, ISettings settings) + { + this->input_data = input; + this->settings = settings; + + this->initializeLists(); + } + + ~Network() + { + this->deleteLists(); + } + + void partialSK_SS() + { + for(uint32_t i = 0; i < this->settings.I; i++) + { + Rcout << "partialSK-SS, Iteration: " << i << "\n"; + this->emptyCurrCountsAndG(); + this->partialSK_SS_iter(); + this->ord_V_G.clear(); + } + } + + void SK_SS() + { + this->partialSK_SS(); + + for(uint32_t i = 0; i < this->settings.I; i++) + { + Rcout << "SK-SS, Iteration: " << i << "\n"; + this->resetSampledG(); + this->V_G.SampleAdjacencyMatrix(this->G_sampled); + + this->emptyCurrCountsAndG(); + this->SK_SS_phase2_iter(); + } + + Rprintf("End of SK_SS() function here\n"); + } + + List& result() + { + Rprintf("Optimum log probability: %.10f\n", this->probabilityMaxOptim); + + const List* SK_list = this->input_data.SK; + List* ret = new List(SK_list->length()); + + for(uint32_t i = 0; i < SK_list->length(); i++) + { + (*ret)[i] = *this->G_optim[i]; + } + + return *ret; + } + +protected: + void SK_SS_phase2_iter() + { + Rprintf("phase 2\n"); + List twoScores = this->callScoreFull(); + const List* SK_list = this->input_data.SK; + + IntegerVector riter = Rcpp::sample( + this->input_data.N, (this->input_data.N)); + + saveRDS(riter, Rcpp::Named("file", "sampled_indices.RData")); + + + double bge = twoScores["pDG"]; + double energy = twoScores["pG"]; + double probabilityMovedG = bge + energy; + + if(probabilityMovedG > this->probabilityMaxOptim) + { + Rcout << "YOU IDIOT, YOU HAVE BROKEN SAMPLING!\n"; + Rprintf("%.9f x %.9f\n", (probabilityMovedG), this->probabilityMaxOptim); + } + + + for(uint32_t i = 0; i < riter.size(); i++) + { + this->current_id = i; + + uint32_t flatindex = riter[i] - 1; + uint32_t lindex = 0; + uint32_t xyindex = flatindex; + + uint32_t sumSizes = 0; + while(true) + { + uint32_t sizeMatrix = (*this->input_data.sizes)[lindex] * (*this->input_data.sizes)[lindex + 1]; + sumSizes += sizeMatrix; + + if(sumSizes > flatindex) + { + break; + } + lindex++; + xyindex -= sizeMatrix; + } + + const IntegerMatrix& sk = (*SK_list)[lindex]; + uint32_t r = xyindex % sk.nrow(); + uint32_t c = xyindex / sk.nrow(); + + if(i % PRINT_FREQ == 0) // i % PRINT_FREQ == 0 + { + this->DebugInfo(SKSS_PHASE, i); + } + + if(i % SAVE_FREQ == 0) + { + //this->SaveGraph(); + } + + // if inside G_0 -> skip + if((*this->G[lindex])(r,c) != 0) + continue; + + // try add an edge outside of G_0 + // do not update V_G since we want to sample from partialSK-SS only! + // because of that -> shouldUpdateTrie = false + this->tryPerformMove(&twoScores, lindex, r, c, ChangeType::ADD, false); + + + } + Rprintf("end of phase 2\n"); + } + + + void partialSK_SS_iter() + { + List twoScores = this->callScoreFull(); + const List* SK_list = this->input_data.SK; + const List* PK_list = this->input_data.PK; + + IntegerVector riter = Rcpp::sample( + this->input_data.N, this->input_data.N); + //saveRDS(riter, Rcpp::Named("file", "sampled_indices.RData")); + Rprintf("partialSKSS-add-only phase\n"); + + + for(uint32_t i = 0; i < this->input_data.N; i++) + { + this->current_id = i; + + uint32_t flatindex = riter[i] - 1; + uint32_t lindex = 0; + uint32_t xyindex = flatindex; + + uint32_t sumSizes = 0; + while(true) + { + uint32_t sizeMatrix = (*this->input_data.sizes)[lindex] * (*this->input_data.sizes)[lindex + 1]; + + sumSizes += sizeMatrix; + + if(sumSizes > flatindex) + { + break; + } + lindex++; + xyindex -= sizeMatrix; + } + + const IntegerMatrix& sk = (*SK_list)[lindex]; + const IntegerMatrix& pk = (*PK_list)[lindex]; + uint32_t r = xyindex % sk.nrow(); + uint32_t c = xyindex / sk.nrow(); + + if(i % (PRINT_FREQ * 1000) == 0) + { + this->DebugInfo(PARTIAL_ADD_ONLY, i); + } + // if((sk(r, c) == 0) && + // (pk(r, c) != 1)) + if((sk(r, c) == 0)) + { + continue; + } + if(i % SAVE_FREQ == 0) + //this->SaveGraph(); + + this->tryPerformMove(&twoScores, lindex, r, c, ChangeType::ADD); + + } + + Rprintf("partialSKSS-add-remove phase\n"); + riter = Rcpp::sample( + this->input_data.N, this->input_data.N); + + for(uint32_t i = 0; i < this->input_data.N; i++) + { + uint32_t flatindex = riter[i] - 1; + uint32_t lindex = 0; + uint32_t xyindex = flatindex; + + uint32_t sumSizes = 0; + while(true) + { + uint32_t sizeMatrix = (*this->input_data.sizes)[lindex] * (*this->input_data.sizes)[lindex + 1]; + + sumSizes += sizeMatrix; + + if(sumSizes > flatindex) + { + break; + } + lindex++; + xyindex -= sizeMatrix; + } + + const IntegerMatrix& sk = (*SK_list)[lindex]; + const IntegerMatrix& pk = (*PK_list)[lindex]; + uint32_t r = xyindex % sk.nrow(); + uint32_t c = xyindex / sk.nrow(); + + if(i % (PRINT_FREQ * 1000) == 0) + { + this->DebugInfo(PARTIAL_ADD_REMOVE, i); + } + if(i % SAVE_FREQ == 0) + //this->SaveGraph(); + + // if((sk(r, c) == 0) && + // (pk(r, c) != 1)) + if((sk(r, c) == 0)) + { + continue; + } + + LogicalMatrix& gz = *(this->G[lindex]); + + if(gz(r,c) == 0) // if not in G -> try add + this->tryPerformMove(&twoScores, lindex, r, c, ChangeType::ADD); + else // if in G -> try delete + this->tryPerformMove(&twoScores, lindex, r, c, ChangeType::DELETE); + } + } + + void DebugInfo(std::string phase, uint32_t iter) + { + Rprintf("%s[%5u]/[%5u][Trie:%5u|%5u][Score:%9.6f]\n", + phase.c_str(), iter, this->input_data.N, + this->V_G.num_graphs(), this->V_G.num_nodes(), + this->probabilityMaxOptim); + } + + List& G_to_list() + { + const List* SK_list = this->input_data.SK; + List* ret = new List(SK_list->length()); + + for(uint32_t i = 0; i < SK_list->length(); i++) + { + (*ret)[i] = *this->G[i]; + } + + return *ret; + } + + List& G_optim_to_list() + { + const List* SK_list = this->input_data.SK; + List* ret = new List(SK_list->length()); + + for(uint32_t i = 0; i < SK_list->length(); i++) + { + (*ret)[i] = *this->G_optim[i]; + } + + return *ret; + } + + void resetSampledG() + { + const List* SK_list = this->input_data.SK; + + for(uint32_t i = 0; i < SK_list->length(); i++) + { + const IntegerMatrix& sk = (*SK_list)[i]; + + *(this->G_sampled)[i] = LogicalMatrix(sk.nrow(), sk.ncol()); + Rcpp::rownames(*(this->G_sampled)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->G_sampled)[i]) = Rcpp::colnames(sk); + } + } + + void emptyCurrCountsAndG() + { + const List* SK_list = this->input_data.SK; + + for(uint32_t i = 0; i < SK_list->length(); i++) + { + const IntegerMatrix& sk = (*SK_list)[i]; + + *(this->G)[i] = Rcpp::LogicalMatrix( + Rcpp::clone(*this->G_sampled[i]) + ); + Rcpp::rownames(*(this->G)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->G)[i]) = Rcpp::colnames(sk); + } + } + + /** + * We should convert Rcpp lists to vectors of matrices because: + * > we can't easily manipulate Rcpp List objects + * > with matrix pointers it is easier to manage memory manually + * > BUT! we should still use matrices to call R score functions + */ + void initializeLists() + { + const List* SK_list = this->input_data.SK; + + for(uint32_t i = 0; i < SK_list->length(); i++) + { + const IntegerMatrix& sk = (*SK_list)[i]; + + (this->G).push_back(new LogicalMatrix(sk.nrow(), sk.ncol())); + Rcpp::rownames(*(this->G)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->G)[i]) = Rcpp::colnames(sk); + + (this->G_optim).push_back(new LogicalMatrix(sk.nrow(), sk.ncol())); + Rcpp::rownames(*(this->G_optim)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->G_optim)[i]) = Rcpp::colnames(sk); + + (this->G_sampled).push_back(new LogicalMatrix(sk.nrow(), sk.ncol())); + Rcpp::rownames(*(this->G_sampled)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->G_sampled)[i]) = Rcpp::colnames(sk); + } + + } + + /** + * Clear matrices in vectors + */ + void deleteLists() + { + const List* SK_list = this->input_data.SK; + for(uint32_t i = 0; i < SK_list->length(); i++) + { + delete this->G[i]; + delete this->G_optim[i]; + } + + this->G.clear(); + this->G_optim.clear(); + } + + + void tryPerformMove(List* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action, + bool shouldUpdateTrie = true) + { + // Compute the possible score + List scoreChanges = this->callScoreChange(score, listID, from, to, action); + + /** + * Modified(explicit per-partes) is used, e.g.: + * + * since we are working in small precisions + log scale: + * > difference is calculated in SCORE functions manually + * > difference is separately calculated for BGe / Energy part + * > the previous is because they have orders of magnitude differences + * > ONLY afterwards the summed difference is converted by exp, since it is tractable! + */ + double prior_diff = scoreChanges["pG_change"]; + double bge_diff = scoreChanges["pDG_change"]; + double mod_diff = (prior_diff + bge_diff); + double diff = exp(mod_diff); + + // Convert the log difference to B/A and normalize it + // normalize(1, B/A) = normalize(A,B) + // that way we get probabilities for two candidates! + NumericVector probs = {1.0, diff}; + probs = probs / Rcpp::sum(probs); + IntegerVector ids = {0, 1}; + IntegerVector ret = Rcpp::sample(ids, 1, probs = probs); + + // If the first was sampled, skip + if(ret[0] != 1) return; + + // Update score + *score = scoreChanges; + + // Update G if not in SK-SS part + this->makeMove(listID, from, to, action, shouldUpdateTrie); + + // Update optimum + this->CheckVisitedGraphOptimum(scoreChanges); + } + + + + void makeMove(uint32_t listID, uint32_t from, uint32_t to, ChangeType action, bool shouldUpdateTrie) + { + // update adjacency + (*(this->G[listID]))(from,to) = action == ChangeType::ADD; + + + // If SK-SS part -> don't update Trie + if(!shouldUpdateTrie) + { + return; + } + + // Update our ordered add-only word + key_tp kt = std::make_tuple((uint8_t)listID, (uint16_t)from, (uint16_t)to); + if(action == ChangeType::ADD) + { + insert_sorted(this->ord_V_G, kt); + } + else{ + erase_sorted(this->ord_V_G, kt); + } + + // Try to add word to Trie + this->V_G.AppendChain(this->ord_V_G); + } + + void SaveGraph() + { + // Save intermediate results + Rprintf("Saving intermediate results [%9.6f]\n", this->probabilityMaxOptim); + + // Save optimum + List& tmp_adjs = this->G_optim_to_list(); + char fileName[50]; + char fileName2[50]; + sprintf(fileName, "optimG_%9.6f.RData", this->probabilityMaxOptim); + sprintf(fileName2, "optimG_%9.6f_copy.RData", this->probabilityMaxOptim); + //saveRDS(tmp_adjs, Rcpp::Named("file", fileName)); + //saveRDS(tmp_adjs, Rcpp::Named("file", fileName2)); + + // Save current + tmp_adjs = this->G_to_list(); + List ret = List(2); + ret[0] = this->current_id; + ret[1] = tmp_adjs; + //saveRDS(ret, Rcpp::Named("file", "G_last.RData")); + + // Save current + + char fileName3[50]; + tmp_adjs = this->G_to_list(); + sprintf(fileName3, "G_%d.RData", this->current_id); + //saveRDS(ret, Rcpp::Named("file", fileName3)); + + // Save visited scores and reset them + ret = List(2); + ret[0] = Rcpp::wrap(acceptedScores); + ret[1] = Rcpp::wrap(acceptedScoresPrior); + //saveRDS(ret, Rcpp::Named("file", "accepted_scores.RData")); + //saveRDS(ret, Rcpp::Named("file", "accepted_scores_copy.RData")); + } + + void CheckVisitedGraphOptimum(List& scoreChanges) + { + double energy = scoreChanges["pG"]; + double bge = scoreChanges["pDG"]; + + + double probabilityMovedG = bge + energy; + + this->acceptedScores.push_back(bge); + this->acceptedScoresPrior.push_back(energy); + + if(probabilityMovedG <= this->probabilityMaxOptim) + { + return; + } + + //Rcout << "New optim found! " << this->probabilityMaxOptim << "x" << probabilityMovedG << "\n"; + //Rcout << sum(NumericMatrix(*this->G_optim[0])) << " x " << sum(NumericMatrix(*this->G[0])) << "\n"; + this->probabilityMaxOptim = probabilityMovedG; + + const List* SK_list = this->input_data.SK; + for(uint32_t i = 0; i < SK_list->length(); i++) + { + *this->G_optim[i] = Rcpp::LogicalMatrix( + Rcpp::clone(*this->G[i]) + ); + } + } + + + + List callScoreChange(List* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action) + { + List& adjs = this->G_to_list(); + List scoreChanges = (*this->input_data.score_change_func) + ( + adjs, + *this->input_data.PK, + this->settings.beta_L, + this->settings.beta_H, + *this->input_data.bge_param, + *score, + listID, + from, + to, + action == ChangeType::ADD); + + delete &adjs; + + return scoreChanges; + } + + List callScoreFull() + { + List& adjs = this->G_to_list(); + List twoScores = (*this->input_data.score_full_func)( + adjs, + *this->input_data.PK, + this->settings.beta_L, + this->settings.beta_H, + *this->input_data.bge_param + ); + delete &adjs; + return twoScores; + } +}; + +/** + * This function is the interface to the R part, the only function exported + */ +List c_SK_SS( + const List& PK, + const List& SK, + const IntegerVector& sizes, + const List& param, + uint32_t I, + bool debug, + double beta_L, double beta_H, + Function score_full_func, + Function score_change_func); + + +/** + * This part is the implementation of above header declarations, it goes to .CPP part in package + */ +//#include "skeleton.h" + +// [[Rcpp::export]] +List c_SK_SS( + const List& PK, + const List& SK, + const IntegerVector& sizes, + const List& param, + uint32_t I, + bool debug, + double beta_L, double beta_H, + Function score_full_func, + Function score_change_func +) +{ + uint32_t N = 0; + for(uint32_t i = 0; i < (sizes.size() - 1); i++) + { + N += sizes[i] * sizes[i+1]; + } + Rprintf("N = %u\n", N); + + IData i_data; + + i_data.PK = &PK; + i_data.SK = &SK; + i_data.sizes = &sizes; + i_data.bge_param = ¶m; + i_data.N = N; + + i_data.score_full_func = &score_full_func; + i_data.score_change_func = &score_change_func; + + ISettings settings; + + settings.I = I; + settings.debug = debug; + settings.beta_H = beta_H; + settings.beta_L = beta_L; + + Network* net = new Network(i_data, settings); + + net->SK_SS(); + List ret = net->result(); + delete net; + + return ret; +} diff --git a/source/synthetic.R b/source/synthetic.R new file mode 100644 index 0000000000000000000000000000000000000000..34145ff797e3daf5128791015bcd8ae692a2c565 --- /dev/null +++ b/source/synthetic.R @@ -0,0 +1,467 @@ +my.ecdf <- function(x) { + x <- sort(x) + x.u <- unique(x) + n <- length(x) + x.rle <- rle(x)$lengths + y <- (cumsum(x.rle)-0.5) / n + FUN <- approxfun(x.u, y, method="linear", yleft=0, yright=1, + rule=2) + FUN +} +KL_est <- function(x, y) { + dx <- diff(sort(unique(x))) + dy <- diff(sort(unique(y))) + ex <- min(dx) ; ey <- min(dy) + e <- min(ex, ey)/2 + n <- length(x) + P <- my.ecdf(x) ; Q <- my.ecdf(y) + KL <- sum( log( (P(x)-P(x-e))/(Q(x)-Q(x-e)))) / n + KL +} + +softmax <- function(v) +{ + ret <- exp(v) + ret <- ret / sum(ret) + return(ret) +} + +compare_methods <- function(input_type="sigmoid", only_allowed=FALSE, add_random_reference=FALSE) +{ + library(caret) + library(ggplot2) + library(stringr) + library(stringi) + library(RColorBrewer) + library(ggnewscale) + library(ggrepel) + + if(input_type == "sigmoid") + { + folder <- "~/Desktop/pim-sk-ss/results/MAP/" + } + else{ + folder <- "~/Desktop/pim-sk-ss/results/SK/SK_algos/" + } + + data_fname <- "data.RDS" + data <- readRDS("~/Desktop/pim-sk-ss/results/MAP/data.RDS") + parts <- data$partition + to_test <- list.files(folder) + + #myColors <- brewer.pal(10, "Paired") + myColors <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", + "blue", "#D55E00", "#CC79A7", "red", "green", "purple", + "cyan") + + myColors <- rep(myColors, 10) + + mets <- c("cor", "mc-cor", "smc-cor", "zf", "mc-zf", "smc-zf", "mi-g", "mc-mi-g", "smc-mi-g", "mi-g-sh") + + if(only_allowed) + { + real <- c(data$Real_adjacency[parts[[1]], parts[[2]]], + data$Real_adjacency[parts[[2]], parts[[3]]]) + } + else{ + real <- unlist(c(data$Real_adjacency)) + } + + real <- factor( + real, + levels = c(0, 1)) + + df_res <- data.frame(matrix(0, nrow=0, ncol=7)) + for(fname in to_test) + { + if(fname == data_fname) + { + next + } + + adj <- readRDS(paste(folder, fname, sep="")) + #SK <- amat(adj) + SK <- adj + if(nrow(SK) != length(unlist(data$partition))) + { + N <- length(unlist(data$partition)) + SK_correct <- matrix(0, N, N) + rownames(SK_correct) <- unlist(data$partition) + colnames(SK_correct) <- unlist(data$partition) + SK_correct[rownames(SK), colnames(SK)] <- SK + SK_correct <- data.frame(SK_correct) + SK <- SK_correct + } + + + if(only_allowed) + { + pred <- c(unlist(SK[parts[[1]], parts[[2]]]), unlist(SK[parts[[2]], parts[[3]]])) + } + else{ + pred <- unlist(c(SK)) + } + pred <- factor( + pred, + levels = c(0, 1)) + if(length(pred) != length(real)) + { + asd <- 1 + } + confMat <- caret::confusionMatrix(pred,real) + #TPR <- confMat$table[2,2] / sum(confMat$table[,2]) + #FPR <- confMat$table[2,1] / sum(confMat$table[,1]) + TPR <- confMat$table[1,2] + FPR <- confMat$table[2,1] + + acc <- (confMat$table[1,1] + confMat$table[2,2]) / sum(c(confMat$table)) + acc <- confMat$table[1,2] + confMat$table[2,1] + + fname_combo <- strsplit(fname, ".RDS")[[1]] + algo <- stri_split_fixed(fname_combo, "-", 2, simplify = TRUE)[1] + #met <- stri_split_fixed(fname_combo, "-", 2, simplify = TRUE)[2] + + met <- stri_split_fixed(fname_combo, "_", 2, simplify = TRUE)[2] + algo <- stri_split_fixed(fname_combo, "_", 2, simplify = TRUE)[1] + #acc <- met + num_iter <- stri_split_fixed(met, "_", 2, simplify = TRUE)[1] + num_samples <- stri_split_fixed(met, "_", 2, simplify = TRUE)[2] + + df_res <- rbind(df_res, c(TPR, FPR,acc,algo,met, num_iter, num_samples)) + } + if(FALSE)#add_random_reference) + { + pred <- sample(c(0,1), length(real), replace = TRUE) + pred <- factor( + pred, + levels = c(0, 1)) + confMat <- caret::confusionMatrix(pred,real) + TPR <- confMat$table[2,2] / sum(confMat$table[,2]) + FPR <- confMat$table[2,1] / sum(confMat$table[,1]) + acc <- (confMat$table[1,1] + confMat$table[2,2]) / sum(c(confMat$table)) + acc <- confMat$table[1,2] + confMat$table[2,1] + + if(acc > 50) + { + next + } + + + algo <- "random" + met <- "random" + df_res <- rbind(df_res, c(TPR, FPR,acc,algo,met)) + } + + + + df_res[,1:2] <- sapply(df_res[, 1:2], as.numeric) + df_res[,6:7] <- sapply(df_res[, 6:7], as.numeric) + colnames(df_res) <- c("TPR", "FPR", "accuracy", + "algorithm", "method", "num_iter", "num_samples") + + df_res$SHD <- df_res$TPR + df_res$FPR + df_res <- df_res[order(df_res$SHD),] + df_res <<- df_res + + if(only_allowed) + { + title_name <- "ROC graph with allowed edges only" + } + else{ + title_name <- "ROC graph with all edges(even forbidden interactions)" + } + + ggplot(data=df_res, aes(x = FPR, y = TPR, color=algorithm)) + + geom_point() + + #geom_text(aes(label=algorithm), hjust=0, vjust=0) + + xlab("False Positive Rate") + ylab("True Positive Rate") + + #scale_colour_manual(name = "Scoring metric", values = myColors) + + ggnewscale::new_scale_color() + + geom_text(aes(x = FPR, y = TPR, label=SHD, + #label=sprintf("%0.2d", round(accuracy, digits = 2)), + color="-" + ), + hjust=0, vjust=-1) + + scale_colour_manual(name = "Accuracy", values = c("black")) + + ggtitle(label=title_name)# + + #ggplot2::xlim(0, 50) #+ + #ggplot2::ylim(0, 125) + +} + + + + +find_closest <- function(GE) +{ + library(philentropy) + library(transport) + + tst <- GE + tst <- unlist(tst) + storage.mode(tst) <- "numeric" + #tst <- tst / 10^6 + #tst <- log(tst / (1 - tst)) + + #tst <- tst[tst > -50] + #tst <- tst[tst < 50] + dataOrig <- c(tst) + dataOrig <- unlist(dataOrig) + + #h_tst <- hist(tst, plot=FALSE, xlim = c(-50, 50)) + #h_tst$counts=h_tst$counts/sum(h_tst$counts) + + pwr <- seq(-3,2,1) + grd <- c( + -(10^pwr), + 0, + 10^pwr + ) + grd_pos <- c( + 10^pwr + ) + + all_combos <- expand.grid( + grd_pos, + grd, + grd_pos, + grd_pos, + grd + ) + i <- 1 + + best_params <- NA + best_score <- 999999 + + values <- apply(all_combos, 1, function(rw) + { + dist <- generate_data(epsilon_std=rw[1], beta_mu=rw[2], beta_std=rw[3], beta_zero_std=rw[4], beta_zero_mu = rw[5]) + dist2 <- generate_data(epsilon_std=rw[1], beta_mu=rw[2], beta_std=rw[3], beta_zero_std=rw[4], beta_zero_mu = rw[5]) + dist3 <- generate_data(epsilon_std=rw[1], beta_mu=rw[2], beta_std=rw[3], beta_zero_std=rw[4], beta_zero_mu = rw[5]) + + dataDist <- c(dist$Data, dist2$Data, dist3$Data) + dataDist <- unlist(dataDist) + + #rkl <- ks.test(c(dist$Data), tst) + rkl <- wasserstein1d(dataDist, dataOrig) + #x <- rbind(h_tst$counts, h$counts) + #rkl <- KL_est(tst, c(dist$Data)) + #rkl <- KL(x, unit='log2') + cat("i=",i,"/",nrow(all_combos), "rkl=", rkl, "\n") + + if(rkl < best_score) + { + best_score <<- rkl + best_params <<- rw + saveRDS(list(score=best_score, params=best_params), "results/best_params.RData") + } + + i <<- i+1 + + if(is.infinite(rkl)) + { + return(rkl) + } + + return(rkl) + }) + return(values) +} + + +.generate_manual <- function(N_samples, G, partitions, + epsilon_std, beta_mu, beta_std, beta_zero_mu, beta_zero_std) +{ + N <- ncol(G) + names <- unlist(partitions) + + # First -- generate all parameters of Gaussian network + params <- sapply(1:N, function(i) + { + T_i <- sum(G[,i] == 1) # num parents + beta_zero <- rnorm(1, mean=beta_zero_mu, sd=beta_zero_std) + #beta_zero <- beta_zero * sample(c(0,1), 1, prob=c(0.005, 0.995)) + #beta_zero <- rnorm(1, mean=beta_zero_mu, sd=beta_zero_std) + #beta_zero <- rbeta(1, 8, 2) * beta_zero_mu + + + mean_beta <- rnorm(T_i, mean=beta_mu, sd=beta_std) + betas <- sapply(mean_beta, function(m) + rnorm(1, mean=m, sd=beta_std) + #rpois(1, lambda = mean_beta^2) + ) + betas <- unlist(betas) + betas <- betas^2 + + betas <- c(beta_zero, betas) + # if(T_i > 0) + # { + # m <- rnorm(1, mean=beta_mu, sd=beta_std) + # beta_0_multiplier <- rnorm(1, mean=m, sd=beta_std) + # normalize_coefs <- softmax(c(beta_0_multiplier, betas)) + # betas <- normalize_coefs + # betas[1] <- betas[1] * beta_zero + # }else{ + # betas <- c(beta_zero, betas) + # } + + + + return(c(betas)) + }) + names(params) <- names + + # Second -- generate synthetic normalized data according to params + norm_data <- sapply(1:N_samples, function(x) + { + X <- lapply(params, function(v) + v[1] + rnorm(1, mean=0, sd=epsilon_std)) # beta_0 + epsilon + X <- unlist(X) + names(X) <- names + + # miRNA iteration + expr <- G * X # take expr values without any parents + expr_mi <- expr[,c(partitions[[2]])] + expr_mi <- apply(expr_mi, 2, function(cl) c(0, cl[cl != 0])) + param_mi <- params[c(partitions[[2]])] + X_mi <- Map('*', expr_mi, param_mi) + X_mi <- lapply(X_mi, sum) + X[c(partitions[[2]])] <- X[c(partitions[[2]])] + unlist(X_mi) + # mRNA iteration + expr <- G * X # take expr values without any parents + expr_mrna <- expr[,c(partitions[[3]])] + expr_mrna <- apply(expr_mrna, 2, function(cl) c(0, cl[cl != 0])) + param_mrna <- params[c(partitions[[3]])] + X_mrna <- Map('*', expr_mrna, param_mrna) + X_mrna <- lapply(X_mrna, sum) + X[c(partitions[[3]])] <- X[c(partitions[[3]])] + unlist(X_mrna) + + return(X) + }) + norm_data <- base::t(norm_data) + storage.mode(norm_data) <- "numeric" # so that we can perform operations + + # Lastly -- un-normalize the data according to the TPM procedure + # First -- convert to 0,1 range by sigmoid ?? + # Next -- convert to TPM counts by *10^6 + Data <- norm_data + #Data <- norm_data + colnames(Data) <- names + + return(list(Data=Data, params=params)) +} + +#' +#' @param N_samples A number of samples to be generated +#' @param n_circ A number of circRNA nodes to be generated +#' @param n_mirna A number of miRNA nodes to be generated +#' @param n_mrna A number of mRNA nodes to be generated +#' @param p_rand A fraction of all possible edges to be included in synthetic dataset +#' @param TPM_f_circ A TPM count factor for circRNA count multiplication +#' @param TPM_f_mirna A TPM count factor for miRNA count multiplication +#' @param TPM_f_mrna A TPM count factor for mRNA count multiplication +#' @param prior_FN A fraction of included edges to be removed from the prior knowledge +#' @param prior_FP A fraction of prior knowledge edges to be added by the false edges +#' @param type A type of output needed.\\ +#' "gaussian" = just the output of gaussian network\\ +#' "sigmoid" = sigmoid-applied probabilities in (0,1) range +#' +#' +#' @returns A list consisting of multiple data. +#' +#' Data: a data matrix of size (N_samples, n_circ + n_mirna + n_mrna) consisting of synthetic counts +#' Real_adjacency: an adjacency matrix of a real underlying network +#' PK_adjacency: a given prior knowledge adjacency matrix that contains part of real edges and a fraction of false edges +#' partitions: a list of circRNA, miRNA, mRNA node name vectors +#' + +generate_data <- function(N_samples=100, + n_circ = 5, n_mirna = 20, n_mrna = 100, + p_rand = 0.05, + prior_FP = 0.20, + prior_FN = 0.20, + epsilon_std = 0.1, + beta_zero_mu=-11.2, + beta_zero_std=0.05, + beta_mu=0.0, # these parameters simulate the range of values: (0, 19) with mean=13 of TPM MDS data + beta_std=0.2, # 0.2 + type="gaussian" +) +{ + N <- n_circ + n_mirna + n_mrna + names_circ <- paste("circ.", 1:n_circ, sep="") + names_mirna <- paste("mirna.", 1:n_mirna, sep="") + names_mrna <- paste("mrna.", 1:n_mrna, sep="") + names <- c(names_circ, names_mirna, names_mrna) + partitions <- list(circRNA=names_circ, miRNA=names_mirna, mRNA=names_mrna) + + Real_adjacency <- matrix(0, N, N) + # randomly select indices (Question: maybe more convenient way is to sample equally across all genes) + # CircRNA - miRNA + mat_circ_mi <- matrix(0, n_circ, n_mirna) + N_circ_mi <- n_circ * n_mirna + circ_mi_indices <- sample(N_circ_mi, N_circ_mi * p_rand) + mat_circ_mi[circ_mi_indices] <- 1 + Real_adjacency[1:n_circ, (n_circ + 1):(n_circ + n_mirna)] <- mat_circ_mi + # miRNA - mRNA + mat_mi_m <- matrix(0, n_mirna, n_mrna) + N_mi_m <- n_mirna * n_mrna + mi_m_indices <- sample(N_mi_m, N_mi_m * p_rand) + mat_mi_m[mi_m_indices] <- 1 + Real_adjacency[(n_circ + 1):(n_circ + n_mirna), (n_circ + n_mirna + 1):(N)] <- mat_mi_m + # Set fake names + colnames(Real_adjacency) <- names + rownames(Real_adjacency) <- names + + # Generate synthetic data samples + gen_data <- .generate_manual(N_samples = N_samples, + G=Real_adjacency, + partitions=partitions, + epsilon_std = epsilon_std, + beta_mu = beta_mu, + beta_std = beta_std, + beta_zero_mu=beta_zero_mu, + beta_zero_std=beta_zero_std + ) + + if(type == "sigmoid") + { + gen_data$Data <- 10^6 / (1 + exp(-gen_data$Data)) + } + + Data <- gen_data$Data + Gaussian_params <- gen_data$params + + # Simulate prior knowledge + PK_adjacency <- Real_adjacency + true_indices <- which(PK_adjacency == 1) + to_remove <- sample(true_indices, length(true_indices) * prior_FN) + + + PK_circ_mi <- PK_adjacency[partitions$circRNA, partitions$miRNA] + PK_mi_m <- PK_adjacency[partitions$miRNA, partitions$mRNA] + + false_indices <- which(PK_circ_mi == 0) + init_indices <- which(PK_circ_mi == 1) + to_add <- sample(false_indices, length(init_indices) * prior_FP) + PK_circ_mi[to_add] <- 1 + + false_indices <- which(PK_mi_m == 0) + init_indices <- which(PK_mi_m == 1) + to_add <- sample(false_indices, length(init_indices) * prior_FP) + PK_mi_m[to_add] <- 1 + + PK_adjacency[partitions$circRNA, partitions$miRNA] <- PK_circ_mi + PK_adjacency[partitions$miRNA, partitions$mRNA] <- PK_mi_m + PK_adjacency[to_remove] <- 0 + + + # Return list with all output + ret <- list() + ret$Data <- data.frame(Data) + ret$Real_adjacency <- Real_adjacency + ret$PK_adjacency <- PK_adjacency + ret$partition <- partitions + ret$Gaussian_params <- Gaussian_params + return(ret) +} \ No newline at end of file diff --git a/visualization.R b/visualization.R new file mode 100644 index 0000000000000000000000000000000000000000..610220989022740d141b37be9d6ce9a2f5e02d04 --- /dev/null +++ b/visualization.R @@ -0,0 +1,248 @@ +draw_confusion_matrix <- function(cm, title_name='CONFUSION MATRIX') { + + layout(matrix(c(1,1,2))) + par(mar=c(2,2,2,2)) + plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n') + title(title_name, cex.main=2) + + # create the matrix + rect(150, 430, 240, 370, col='#3F97D0') + text(195, 435, 'Class1', cex=1.2) + rect(250, 430, 340, 370, col='#F7AD50') + text(295, 435, 'Class2', cex=1.2) + text(125, 370, 'Predicted', cex=1.3, srt=90, font=2) + text(245, 450, 'Actual', cex=1.3, font=2) + rect(150, 305, 240, 365, col='#F7AD50') + rect(250, 305, 340, 365, col='#3F97D0') + text(140, 400, 'Class1', cex=1.2, srt=90) + text(140, 335, 'Class2', cex=1.2, srt=90) + + # add in the cm results + res <- as.numeric(cm$table) + text(195, 400, res[1], cex=1.6, font=2, col='white') + text(195, 335, res[2], cex=1.6, font=2, col='white') + text(295, 400, res[3], cex=1.6, font=2, col='white') + text(295, 335, res[4], cex=1.6, font=2, col='white') + + # add in the specifics + plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n') + text(10, 85, names(cm$byClass[1]), cex=1.2, font=2) + text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2) + text(30, 85, names(cm$byClass[2]), cex=1.2, font=2) + text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2) + text(50, 85, names(cm$byClass[5]), cex=1.2, font=2) + text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2) + text(70, 85, names(cm$byClass[6]), cex=1.2, font=2) + text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2) + text(90, 85, names(cm$byClass[7]), cex=1.2, font=2) + text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2) + + # add in the accuracy information + text(30, 35, names(cm$overall[1]), cex=1.5, font=2) + text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4) + text(70, 35, names(cm$overall[2]), cex=1.5, font=2) + text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4) +} +check_test <- function(idx) +{ + flat_index <- indices[idx] - 1 + lindex <- as.integer(0) + xyindex <- flat_index + sumSizes <- as.integer(0) + while(TRUE) + { + sizeMatrix <- sizes[lindex + 1] * sizes[lindex + 1 + 1]; + + sumSizes <- sumSizes + sizeMatrix; + + if(sumSizes > flat_index) + { + break; + } + lindex <- lindex + 1; + xyindex <- xyindex - sizeMatrix; + } + + x <- xyindex %% sizes[lindex + 1] + y <- xyindex / sizes[lindex + 1] + y <- as.integer(y) + + if((x >= sizes[lindex + 1]) || + (y >= sizes[lindex + 2])) + { + printf("sizeMatrix: %d\n", sizeMatrix) + printf("idx: %d || %d = %d: [%d, %d, %d]\n", idx, flat_index, xyindex, lindex, x, y) + printf("ERROR!!!!\n") + } +} + +library(HilbertVis) +library(ggplot2) + +normalize <- function(arr, to_test=NA) +{ + mn = min(arr) + mx = max(arr - mn) + + if(!is.na(to_test)) + { + return((to_test - mn) / mx) + } + arr = arr - mn + arr = arr / mx + return(arr) +} +-8643899 # Marginal / optimB if structure penalty is given -- worse than majority of the +-5386733 # Marginal / optimB if not structure penalty is given -- better than any maximum +-3182.317 +-3310.715 +plot_vec <- function(sc_in, f, col, only_start=FALSE, should_normalize=FALSE, + initial_model=-3506.801) +{ + if(class(sc_in) == "list") + { + sc_bge <- sc_in[[1]] + sc_prior <- sc_in[[2]] + sc_in <- sc_bge + sc_prior + } + else{ + sc_bge <- sc_in + sc_prior <- sc_in + } + + sc_nonnormal <- sc_in + sc_prior_nonnormal <- sc_prior + sc_initial <- initial_model + + if(should_normalize) + { + sc_initial <- normalize(sc_in, sc_initial) + sc_bge <- normalize(sc_bge) + sc_prior <- normalize(sc_prior) + sc_in <- normalize(sc_in) + } + + + if(only_start) + { + diff <- sc_in[-1] - sc_in[-length(sc_in)] + max_id <- which.min(diff) + if(max_id < 3000) + { + max_id <- 3000 + } + sc <- sc_in[1:(max_id+1000)] + + sc_bge <- sc_bge[1:(max_id+1000)] + sc_prior <- sc_prior[1:(max_id+1000)] + } + else + { + sc <- sc_in + } + + N <- length(sc) + job_id <- strsplit(f, "[.]+") + job_id <- unlist(job_id) + job_id <- job_id[1] + main_name <- paste("Score of job ", job_id, sep="") + + # Entire plot + if(length(sc) > 4000) + { + sc_df <- shrinkVector(sc, 4000, mode = "min") + sc_bge_df <- shrinkVector(sc_bge, 4000, mode = "min") + sc_prior_df <- shrinkVector(sc_prior, 4000, mode = "min") + } + else{ + sc_df <- sc + sc_bge_df <- sc_bge + sc_prior_df <- sc_prior + } + + + #xs <- 1:4000 * (length(sc)/4000) + xs <- 1:length(sc_df) + sc_df <- data.frame(x=xs, y=sc_df) + sc_bge_df <- data.frame(x=xs, y=sc_bge_df) + sc_prior_df <- data.frame(x=xs, y=sc_prior_df) + + + max_id <- which.max(sc) + point_name <- paste("Maximum=",sc_nonnormal[max_id]) + p_df <- data.frame(x=max_id/ length(sc), y=sc[max_id], label=point_name) + + min_id <- which.min(sc) + point_name2 <- paste("Minimum=",sc_nonnormal[min_id]) + p_df2 <- data.frame(x=min_id/ length(sc), y=sc[min_id], label=point_name2) + + + + max_id <- which.max(sc_prior) + point_name <- paste("Maximum=",sc_prior_nonnormal[max_id]) + if(!is.na(sc_prior[1])) + { + p_df3 <- data.frame(x=max_id, y=sc_prior[max_id], label=point_name) + + min_id <- which.min(sc_prior) + point_name2 <- paste("Minimum=",sc_prior_nonnormal[min_id]) + p_df4 <- data.frame(x=min_id, y=sc_prior[min_id], label=point_name2) + } + + #checkpoints <- 21 * 10000 + #checkpoints <- data.frame(x=checkpoints, y=sc[checkpoints]) + #checkpoints$label <- paste(checkpoints$x, "=", checkpoints$y) + + print(ggplot() + + ggtitle(main_name) + + + geom_line(data=sc_df, aes(x=x, y=y, colour="BGe+Prior", size=2), na.rm=TRUE) + + geom_line(data=sc_bge_df, aes(x=x, y=y, colour="BGe"), na.rm=TRUE, ) + + geom_line(data=sc_prior_df, aes(x=x, y=y, colour="Prior"), na.rm=TRUE, ) + + + geom_point(data=p_df, aes(x=x, y=y)) + + geom_text(data=p_df, aes(x=x, y=y, label=label), hjust=1, vjust=1) + + geom_point(data=p_df2, aes(x=x, y=y)) + + geom_text(data=p_df2, aes(x=x, y=y, label=label), hjust=-1, vjust=0) + + + #geom_point(data=p_df3, aes(x=x, y=y)) + + #geom_text(data=p_df3, aes(x=x, y=y, label=label), hjust=1, vjust=0) + + #geom_point(data=p_df4, aes(x=x, y=y)) + + #geom_text(data=p_df4, aes(x=x, y=y, label=label), hjust=0, vjust=1) + + geom_hline(yintercept = sc_initial) + + + #geom_point(data=checkpoints, aes(x=x, y=y)) + + #geom_text(data=checkpoints, aes(x=x, y=y, label=label), hjust=0, vjust=0) + + + scale_colour_manual("", + breaks = c("BGe+Prior", "BGe", "Prior"), + values = c("red", "green", "blue")) + + labs() + ) +} + + +plot_graphs <- function(should_normalize = FALSE) +{ + ## BiocManager::install("HilbertVis") + library(HilbertVis) + library(ggplot2) + + files <- list.files("results", pattern = "^I=*") + files <- files[order(nchar(files), files)] + print(files) + #colors <- colors()#c("red", "blue", "black", "green", "orange", "purple") + colors <- rep("red", length(files)) + + for(i in 1:length(files)) + { + f <- files[i] + print(f) + col <- colors[i] + + sc <- readRDS(paste("results/",f,sep="")) + print(head(sc[[1]])) + print(head(sc[[2]])) + plot_vec(sc, f, col, only_start = FALSE, should_normalize = should_normalize) + } +} \ No newline at end of file diff --git a/workflow.Rmd b/workflow.Rmd index 6af5e50b15a60dd818d04623a03218d10a00ebbf..a94ef300f0382d40af0b9f6e86a2b7516b8350ec 100644 --- a/workflow.Rmd +++ b/workflow.Rmd @@ -4,6 +4,22 @@ data <- readRDS("data/precomputed_data.RData") ``` +```{r} +#### If Linux is being used, install the following packages via external package manager: +# (Ubuntu/Debian = apt-get install, etc..) +# 1) libcurl4-openssl-dev +# 2) libpng-dev +# 3) liblzma-dev +# 4) libbz2-dev +# 5) libfontconfig1-dev +# 6) libcairo2-dev +# 7) gfortran +# 8) ibblas-dev +# 9) liblapack-dev +# 10) libudunits2-dev +# 11) libgdal-dev +``` + ## MMPC application ```{r} @@ -37,10 +53,8 @@ parts <- data$parts Include the source code: ```{r, echo=FALSE, warning=FALSE, include=FALSE, error=FALSE,message=FALSE} -library(bnlearn) library(parallel) -suppressWarnings(attach(getNamespace("bnlearn"))) source("source/mmpc_optimized.R") cl <- makeCluster(4) @@ -90,6 +104,8 @@ tbl_expr ## Skeleton-based MMPC Stochastic Search (PiM-SK-SS) ```{r} +# Also need library(RcppArmadillo), but it cannot be libraried, just install it! + source("source/general.R") source("source/skeleton.R") @@ -127,6 +143,18 @@ empirical_int <- SK_SS_tripartite(GE,PK,SK,parts) empirical_int <- readRDS("data/PiM_SK_SS_empirical.RData") ``` +```{r} +empirical_int <- readRDS("data/results/marginal/G_marginal_2.RData") +#empirical_int <- readRDS("data/results/optimB_1.RData") + +empirical_int <- empirical_int[[2]] + +empirical_int2 <- empirical_int +empirical_int[[1]] <- t(empirical_int2[[2]]) +empirical_int[[2]] <- t(empirical_int2[[1]]) + +``` + ## circGPA further analysis #### Push the empirical network into the classical circGPA pipeline @@ -158,25 +186,50 @@ remove(map_mu_circ) > Load circGPA source code and read pre-computed annotation results for three circular RNA with unknown functionality ```{r} +# polynom, geometry, tictoc +# BiocManager: GO.db, multiMiR source("source/circGPA_analysis.R") circname1 <- "hsa_circ_0000227" circname2 <- "hsa_circ_0000228" circname3 <- "hsa_circ_0003793" + +circname4 <-"hsa_circ_0000230" +circname5 <-"hsa_circ_0000231" +circname6 <-"hsa_circ_0000233" + + +all_circs <- c(circname1, circname2, circname3, circname4, circname5, circname6) +all_circs <- c(circname1, circname2, circname3) ``` ```{r} -df1 <- runOnCirc(circname1) -saveRDS(df1, paste("data/", circname1, "_BN.RData", sep="")) -df2 <- runOnCirc(circname2) -saveRDS(df2, paste("data/", circname2, "_BN.RData", sep="")) -df3 <- runOnCirc(circname3) -saveRDS(df3, paste("data/", circname3, "_BN.RData", sep="")) +for(circname_e in all_circs) +{ + df <- runOnCirc(circname_e) + saveRDS(df, paste("data/", circname_e, "_BN.RData", sep="")) +} +reset_to_circGPA_matrix() +for(circname_e in all_circs) +{ + df <- runOnCirc(circname_e) + saveRDS(df, paste("data/", circname_e, "_circGPA.RData", sep="")) +} +``` + +```{r} +reset_to_circGPA_matrix() + +df4 <- runOnCirc(circname4) +saveRDS(df1, paste("data/", circname1, "_circGPA.RData", sep="")) ``` ```{r} -df1 <- readRDS(paste("data/", circname1, "_BN.RData", sep="")) -df2 <- readRDS(paste("data/", circname2, "_BN.RData", sep="")) -df3 <- readRDS(paste("data/", circname3, "_BN.RData", sep="")) +folder <- "data/results/optimB/" +folder <- "data/results/marginal/" + +df1 <- readRDS(paste(folder, circname1, "_BN.RData", sep="")) +df2 <- readRDS(paste(folder, circname2, "_BN.RData", sep="")) +df3 <- readRDS(paste(folder, circname3, "_BN.RData", sep="")) df1_orig <- readRDS(paste("data/", circname1, "_circGPA.RData", sep="")) df2_orig <- readRDS(paste("data/", circname2, "_circGPA.RData", sep="")) df3_orig <- readRDS(paste("data/", circname3, "_circGPA.RData", sep="")) @@ -224,7 +277,8 @@ df2[is.na(df2$pvalue), "pvalue"] <- 1.1 ordering <- match(df2_orig$Row.names, rownames(circGPA2corr)) df2_orig <- df2_orig[ordering,] df2_orig[is.na(df2_orig$pvalue), "pvalue"] <- 1.1 - +# n optimal gene network is obtained from the +# candidate networks obtained in Step3. ordering <- match(df3$Row.names, rownames(circGPA3corr)) df3 <- df3[ordering,] df3[is.na(df3$pvalue), "pvalue"] <- 1.1 @@ -426,11 +480,11 @@ rank1_BN <- rank1_BN[1:100] x = list(orig=rank1_circGPA, corr=rank1_corr, BN=rank1_BN) print(ggVennDiagram(x)) -print(rank1_circGPA[1:3]) +print(rank1_circGPA[1:5]) print("####################################") -print(rank1_corr[1:3]) +print(rank1_corr[1:5]) print("####################################") -print(rank1_BN[1:3]) +print(rank1_BN[1:5]) @@ -478,11 +532,11 @@ rank2_BN <- rank2_BN[1:100] x = list(orig=rank2_circGPA, corr=rank2_corr, BN=rank2_BN) print(ggVennDiagram(x)) -print(rank2_circGPA[1:3]) +print(rank2_circGPA[1:5]) print("####################################") -print(rank2_corr[1:3]) +print(rank2_corr[1:5]) print("####################################") -print(rank2_BN[1:3]) +print(rank2_BN[1:5]) rank2_circGPA <- as.character(df2_orig$Row.names[order(df2_orig$pvalue)]) @@ -523,11 +577,11 @@ rank3_BN <- rank3_BN[1:100] x = list(orig=rank3_circGPA, corr=rank3_corr, BN=rank3_BN) print(ggVennDiagram(x)) -print(rank3_circGPA[1:3]) +print(rank3_circGPA[1:5]) print("####################################") -print(rank3_corr[1:3]) +print(rank3_corr[1:5]) print("####################################") -print(rank3_BN[1:3]) +print(rank3_BN[1:5]) rank3_circGPA <- as.character(df3_orig$Row.names[order(df3_orig$pvalue)]) @@ -557,6 +611,259 @@ ggplot(data = bdata, aes(x=data, y=type)) + geom_boxplot(aes(fill=type)) + xlab( ``` +## Synthetic data test + +```{r} +bigGE <- readRDS("data/precomputed_data.RData") +bigGE <- bigGE$GE +orig_data <- c(bigGE) +orig_data <- unlist(orig_data) +orig_data_Sigmoid <- orig_data +storage.mode(orig_data) <- "numeric" +remove(bigGE) + +#orig_data <- orig_data[!is.na(orig_data)] +#orig_data <- orig_data[orig_data > -50] +#orig_data <- orig_data[orig_data < 50] +``` + +```{r} +source("source/synthetic.R") + +data <- generate_data(N_samples = 100, prior_FP=0.2, prior_FN=0.5, + n_circ = 200, n_mirna = 200, n_mrna = 1000) +data$SigmoidData <- 10^6 / (1 + exp(-data$Data)) +#saveRDS(data, "results/SK/data.RDS") +d1 <- unlist(c(data$SigmoidData)) +d2 <- orig_data_Sigmoid + +hist(d1) +hist(d2) + +b <- min(c(d1,d2)) - 1 # Set the minimum for the breakpoints +e <- max(c(d1,d2)) + 1 # Set the maximum for the breakpoints +ax <- pretty(b:e, n = 12) # Make a neat vector for the breakpoints +#ax <- 0:20 +h1 <- hist(d1, breaks = ax, plot = FALSE) +h2 <- hist(d2, breaks = ax, plot = FALSE) + +plot(h1, col=rgb(0,0,1,1/4)) +plot(h2, col=rgb(0,1,0,1/4), add=T) +legend("topright", c("Generated", "Original"), fill=c(rgb(0,0,1,1/4), rgb(0,1,0,1/4))) + +``` + +```{r} +PK <- data$PK_adjacency +parts <- data$partition +GE <- data$SigmoidData +``` + +```{r echo=FALSE, fig.keep='all', message=FALSE, results='hide', error=FALSE, warning=FALSE} +source("source/mmpc_optimized.R") +source("visualization.R") + + +algorithms <- c( + iamb_fdr=bnlearn::iamb.fdr, + gs=bnlearn::gs, + pc=bnlearn::pc.stable, + hybricPC=bnlearn::hpc, + mmpc=bnlearn::mmpc, + mmpc_tripartite=mmpc_bnlearn_tripartite, + iamb_orig=bnlearn::iamb, + inter_iamb=inter.iamb, + fast_iamb=fast.iamb, + si_hiton_pc=bnlearn::si.hiton.pc +) + +algorithms <-c( + hill_climb=bnlearn::hc, + tabu_search=bnlearn::tabu +) + + +to_test_mets <- c("cor", "mc-cor", "smc-cor", "zf", "mc-zf", "smc-zf", "mi-g", "mc-mi-g", "smc-mi-g", "mi-g-sh") +to_test_mets <- c("loglik-g", "aic-g", "bic-g", "ebic-g", "bge", "pred-loglik-g") + +real <- factor( + c(data$Real_adjacency[parts[[1]], parts[[2]]], data$Real_adjacency[parts[[2]], parts[[3]]]), + levels = c(0, 1)) + + +for(i in 1:length(algorithms)) +{ + alg = algorithms[[i]] + alg_name = names(algorithms)[i] + for(met in to_test_mets) + { + cat(alg_name, ":", met, "\n") + if(alg_name == "mmpc_tripartite") + { + adj <- mmpc_bnlearn_tripartite(data$SigmoidData, partitions = data$partition, test = met) + + } + else{ + adj <- alg(x=data$SigmoidData, test=met) + } + + name_title <- paste(alg_name, "-", met, sep="") + saveRDS(adj, paste("results/SK/SK_algos_sigmoid/", name_title, ".RDS", sep="")) + SK <- amat(adj) + + pred <- factor( + c(SK[parts[[1]], parts[[2]]], SK[parts[[2]], parts[[3]]]), + levels = c(0, 1)) + + draw_confusion_matrix(caret::confusionMatrix(pred,real), title_name = name_title) + } + +} +``` + +```{r} +orig_data <- orig_data / 10^6 +orig_data <- log(orig_data / (1 - orig_data)) +orig_data <- orig_data[orig_data > -40] +orig_data <- orig_data[orig_data < 40] + +d1 <- unlist(c(data$Data)) +d2 <- orig_data + +hist(d1) +hist(d2) + +b <- min(c(d1,d2)) - 1 # Set the minimum for the breakpoints +e <- max(c(d1,d2)) + 1 # Set the maximum for the breakpoints +ax <- pretty(b:e, n = 12) # Make a neat vector for the breakpoints +#ax <- 0:20 +h1 <- hist(d1, breaks = ax, plot = FALSE) +h2 <- hist(d2, breaks = ax, plot = FALSE) + +plot(h1, col=rgb(0,0,1,1/4)) +plot(h2, col=rgb(0,1,0,1/4), add=T) +legend("topright", c("Generated", "Original"), fill=c(rgb(0,0,1,1/4), rgb(0,1,0,1/4))) +``` + +```{r} + +for(i in 1:length(algorithms)) +{ + alg = algorithms[[i]] + alg_name = names(algorithms)[i] + for(met in to_test_mets) + { + cat(alg_name, ":", met, "\n") + if(alg_name == "mmpc_tripartite") + { + adj <- mmpc_bnlearn_tripartite(data$Data, partitions = data$partition, test = met) + + } + else{ + adj <- alg(x=data$Data, test=met) + } + + name_title <- paste(alg_name, "-", met, sep="") + saveRDS(adj, paste("results/SK/SK_algos/", name_title, ".RDS", sep="")) + SK <- amat(adj) + + pred <- factor( + c(SK[parts[[1]], parts[[2]]], SK[parts[[2]], parts[[3]]]), + levels = c(0, 1)) + + draw_confusion_matrix(caret::confusionMatrix(pred,real), title_name = name_title) + } + +} +``` + +```{r, echo=FALSE,warning=FALSE,message=FALSE,error=FALSE, results='hide',fig.keep='all'} +#source("source/skeleton.R") +#SK <- bnlearn::iamb.fdr(data$SigmoidData, undirected=TRUE) +Is = c(10, 20) +#Is = c(60) +ds = c(0.5, 1) +saveRDS(data, "results/MAP/data.RDS") +for(I in Is) +{ + for(dsz in ds) + { + ids <- sample(nrow(data$SigmoidData), dsz * nrow(data$SigmoidData)) + sigData <- data$SigmoidData[ids,] + nonData <- data$Data[ids,] + + #SK <- mmpc_bnlearn_tripartite(x=sigData, undirected = TRUE, partitions = parts) + SK <- iamb.fdr(x=sigData, undirected = TRUE) + SK <- bnlearn::amat(SK) + SK <- data.frame(SK) + fname <- paste("results/MAP/SK_", I, "_", dsz, ".RDS", sep="") + saveRDS(SK, fname) + + ret <- SK_SS_tripartite(sigData,PK,SK,parts=data$partition, I=I) + PK_score <- ret$PK_score + PK_out <- ret$PK_out + storage.mode(PK_out) <- "numeric" + + fname <- paste("results/MAP/PKmap_", I, "_", dsz, ".RDS", sep="") + saveRDS(PK_out, fname) + } +} +``` + +```{r} +library(tidyverse) +source("visualization.R") +pred <- factor( + unlist(c(PK_out[parts[[1]], parts[[2]]], PK_out[parts[[2]], parts[[3]]])), + levels = c(0, 1)) +real <- factor( + c(data$Real_adjacency[parts[[1]], parts[[2]]], data$Real_adjacency[parts[[2]], parts[[3]]]), + levels = c(0, 1)) +confusion_mat <- prop.table(caret::confusionMatrix(pred,real)$table) +#acc <- (confusion_mat %>% diag %>% sum) / sum(confusion_mat) + +acc <- confusion_mat[4] / (confusion_mat[3] + confusion_mat[4]) +cat("accuracy: ", acc * 100, "%\n") + +draw_confusion_matrix(caret::confusionMatrix(pred,real)) +sc <- readRDS("accepted_scores.RData") +plot_vec(sc, "synthetic", "red", only_start = FALSE, should_normalize = TRUE, initial_model = PK_score) +``` + +```{r} +SK <- iamb.fdr(x=sigData, undirected = TRUE) +SK <- amat(SK) +pred <- factor( + unlist(c(SK[parts[[1]], parts[[2]]], SK[parts[[2]], parts[[3]]])), + levels = c(0, 1)) +real <- factor( + c(data$Real_adjacency[parts[[1]], parts[[2]]], data$Real_adjacency[parts[[2]], parts[[3]]]), + levels = c(0, 1)) +confusion_mat <- prop.table(caret::confusionMatrix(pred,real)$table) +#acc <- (confusion_mat %>% diag %>% sum) / sum(confusion_mat) + +acc <- confusion_mat[4] / (confusion_mat[3] + confusion_mat[4]) +cat("accuracy: ", acc * 100, "%\n") + +draw_confusion_matrix(caret::confusionMatrix(pred,real)) +``` + +```{r} +pred <- factor( + c(PK[parts[[1]], parts[[2]]], PK[parts[[2]], parts[[3]]]), + levels = c(0, 1)) +real <- factor( + c(data$Real_adjacency[parts[[1]], parts[[2]]], data$Real_adjacency[parts[[2]], parts[[3]]]), + levels = c(0, 1)) +confusion_mat <- prop.table(caret::confusionMatrix(pred,real)$table) +#acc <- (confusion_mat %>% diag %>% sum) / sum(confusion_mat) + +acc <- confusion_mat[4] / (confusion_mat[3] + confusion_mat[4]) +cat("accuracy: ", acc * 100, "%\n") + +draw_confusion_matrix(caret::confusionMatrix(pred,real)) +``` + ## IntOMICS > **Some explanation:** @@ -569,7 +876,8 @@ ggplot(data = bdata, aes(x=data, y=type)) + geom_boxplot(aes(fill=type)) + xlab( - **PK** -- Prior Knowledge(PK) matrix. It is the table of all *absent* and *present* edges in the databases. For example, our circGPA/MDS task has *tripartite graph* rules: - - Add *absent* edges for every combination of genes except *CIRC -\> MI -\> M* + - Add *absent* edges for every combinaThe prior information can be summarized as\ + a p × p matrix U whose (i, j) element, uij , corrtion of genes except *CIRC -\> MI -\> M* - Add *present* edges for every interaction found in database > #### Import libraries @@ -608,8 +916,15 @@ for(f_path in file_path_vec){source(f_path)} library(ggnewscale) library(ggplot2) library(dplyr) +library(tidyr) library(patchwork) library(igraph) +library(parallel) +library(foreach) +library(iterators) +library(doParallel) +library(bnlearn) +library(bnstruct) ``` #### 5 minute setup @@ -623,9 +938,9 @@ minseglen = 250 #### OR 90 minute setup ```{r} -burn_in = 100000 -thin = 500 -minseglen = 50000 +burn_in = 500 +thin = 100 +minseglen = 200 ``` #### Run the IntOMICS @@ -653,3 +968,14 @@ res_ret <- aut.tuned.MCMC_sampling(burn_in = burn_in, > - **burn_in** -- number of iterations in some part of IntOMICS algorithm. The more it is set -- the more precise it is > > - **thin/minseglen** -- hyper-parameters that are used as threshold for output weights precision. + +```{r} +OMICS_module_res_1 <- OMICS_module(omics = GSE127960_WT_omics, PK = GSE127960_WT_PK, layers_def = GSE127960_WT_layers_def) + +res_ret <- aut.tuned.MCMC_sampling(burn_in = burn_in, + thin = thin, + seed1 = 6782, + seed2 = 375, + OMICS_module_res = OMICS_module_res_1, + minseglen = minseglen) +```