Final changes, only some minor changes to go

3af6a7cd · Anuarbekov, Alikhan · 00358b98 · 3af6a7cd · 3af6a7cd · 3af6a7cd
Commit 3af6a7cd authored 1 year ago by Anuarbekov, Alikhan
--- a/IntOMICS_noimproved/improved_UB.R
+++ b/IntOMICS_noimproved/improved_UB.R
@@ -55,7 +55,7 @@ imprvd_calculate_histogram_of_B_prior <- function(B_prior_mat, coefs)
    nums <- do.call("rbind", apply(B_prior_mat, 2, imprvd_get_counts_per_node, coefs=coefs))
    H <- suppressMessages(
        nums %>% group_by_all() %>% summarise(hist_count = n())
-    )
+        )

    H <- data.frame(H)

@@ -73,8 +73,14 @@ imprvd_calculate_histogram_of_B_prior <- function(B_prior_mat, coefs)
        return(0)
    }

+    if(N <= 3)
+    {
+        K <- 0:N
+    }
+    else{
+        K <- 0:3
+    }

-    K <- 0:N
    choose_term <- choose(N, K)

    term_exp <- N * C  + (1 - 2 * C) * K
@@ -99,7 +105,7 @@ imprvd_calculate_histogram_of_B_prior <- function(B_prior_mat, coefs)
    return(res * count)
 }

-imprvd_log_UB_for_specific_beta <- function(histogram, coefs, beta)
+imprvd_log_UB_for_specific_beta <- function(histogram, coefs, beta, max_fan_in=3)
 {
    ret <- apply(histogram, 1, ..imprvd_calculation_per_set, coefs=coefs, beta=beta)
    ret <- sum(ret)
@@ -131,7 +137,7 @@ imprvd_sample_init_net <- function(empty_net, seed, B_prior)
            {
                picked <- allowed_in_par[sample(length(allowed_in_par), 1)]  # pick 1 of them randomly

-                if(runif(1) > 0.5)
+                if(runif(1) > 0.25)#0.5
                {
                    ret[picked] <- 1
                }
@@ -152,4 +158,291 @@ imprvd_sample_init_net <- function(empty_net, seed, B_prior)
    return( suppressMessages(learn.params(net,dataset_BND)) )
 }

+imprvd_parent_sets_sum_scores_X <- function(selected_node, descendants, parent_set, B_prior)
+{
+    # take all possible in nodes
+    all_allowed_in_parents <- names(which(B_prior[,selected_node] > 0))
+    
+    # remove current children from them
+    all_allowed_in_parents <- all_allowed_in_parents[!(all_allowed_in_parents %in% descendants)]
+    
+    # remove one parent uniformly
+    if(!all(is.na(parent_set)))  # only if not NA
+    {
+      all_allowed_in_parents <- all_allowed_in_parents[all_allowed_in_parents != sample(parent_set, 1)]
+    }
+    
+    
+    # choose how many parents needed
+    K <- length(all_allowed_in_parents)
+    K_need <- sample(0:K, 1)
+    
+    # sample those 
+    sampled_parents <- sample(all_allowed_in_parents, K_need)
+    
+    #return
+    ret <- list(new_parent_set = sampled_parents)
+    
+    if(length(sampled_parents) == 0)
+    {
+      ret$new_parent_set <- NA
+    }
+    #, sum_score_unmarked = 0)
+    #else{
+    #  adj_synt <- matrix(0, ncol(B_prior), 1)
+    #  colnames(adj_synt) <- selected_node
+    #  rownames(adj_synt) <- colnames(B_prior)
+    #  adj_synt[sampled_parents,selected_node] <- 1
+    #  ret$sum_score_unmarked <- BGe_node(selected_node, adj_synt, NA, NA, imprvd_data = imprvd_data)
+    #}
+    
+    return(ret)
+}
+
+imprvd_parent_sets_sum_scores_childrenX <- function(selected_node, children_selected_node, child_order, dag_tmp_bn, B_prior)
+{
+  for(j in child_order)
+  {
+    descendants <- bnlearn::descendants(x = dag_tmp_bn, node = children_selected_node[j])
+    
+    all_allowed_in_parents <- names(which(B_prior[,children_selected_node[j]] > 0))
+    
+    # remove current children from them
+    all_allowed_in_parents <- all_allowed_in_parents[!(all_allowed_in_parents %in% descendants)]  
+    
+    # remove current node
+    all_allowed_in_parents <- all_allowed_in_parents[all_allowed_in_parents != selected_node]
+    
+    # choose how many parents needed
+    K <- length(all_allowed_in_parents)
+    K_need <- sample(0:K, 1)
+    
+    # sample those 
+    sampled_parents <- sample(all_allowed_in_parents, K_need)
+    
+    # add selected node
+    sampled_parents <- c(sampled_parents, selected_node)
+    
+    # update adjacency
+    amat(dag_tmp_bn)[sampled_parents, children_selected_node[j]] <- 1
+  }
+  return(list(dag_tmp_bn = dag_tmp_bn))
+}
+
+
+
+
+
+
+circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
+    r = diameter / 2
+    tt <- seq(0,2*pi,length.out = npoints)
+    xx <- center[1] + r * cos(tt)
+    yy <- center[2] + r * sin(tt)
+    return(data.frame(x = xx, y = yy))
+}
+
+
+determineType <- function(idx, B_prior)
+{
+    in_deg <- sum(B_prior[,idx] > 0)
+    in_deg <- in_deg > 0
+    out_deg <- sum(B_prior[idx,] > 0)
+    out_deg <- out_deg > 0
+
+    # Just a boolean logic for faster computation
+    # IN = 0 and OUT > 0 ----> circ = 1
+    # IN > 0 and OUT > 0 ----> miRNA = 2
+    # IN > 0 and OUT = 0 ----> mRNA = 3
+    return(2 * (in_deg) * xor(in_deg, out_deg) + in_deg + out_deg)
+}
+
+determineYCoordinate <- function(vertex_order)
+{
+    nums <- colMaxs(vertex_order) + 1
+    N_max <- max(vertex_order)
+    coords <- t(t(vertex_order) / nums * N_max)
+    coords <- rowSums(coords)
+    return(coords)
+}
+
+extractAllEdges <- function(idx, I_incidence, B_prior, coords)
+{
+    out_edges <- I_incidence[idx,]
+    out_edges <- which(as.logical(out_edges))
+
+    prior_types <- B_prior[idx, out_edges]
+    prior_types <- prior_types == 0.5
+    prior_types <- ifelse(prior_types, "non-prior edge", "prior edge")
+
+
+    K <- length(out_edges)
+    if(K == 0)
+    {
+        return(NA)
+    }
+    my_coord <- coords[idx,]
+    out_edges <- coords[out_edges,]
+
+    sgm = data.frame(
+        x = rep(as.numeric(my_coord[1]),K),
+        xend = out_edges[,1],
+        y = rep(as.numeric(my_coord[2]),K),
+        yend = out_edges[,2],
+        line_color=prior_types
+    )
+
+    return(sgm)
+}
+
+
+draw_entire_interation_network <- function(I_incidence, B_prior, beta=0, last_beta_move="initial beta", last_move="None: Initial graph", beta_sd=1, GRAPH_SIZE_FACTOR=20)
+{
+    N <- nrow(B_prior)
+
+    if(last_move == "MBR")
+    {
+        last_move <- "Markov blanket resampling"
+    }
+
+    vertex_types <- sapply(1:N, function(idx) determineType(idx, B_prior = B_prior))
+    vertex_order <- sapply(1:3, function(idx) cumsum(vertex_types == idx) * (vertex_types == idx))
+    coords <<- data.frame(x = (vertex_types-1) * GRAPH_SIZE_FACTOR, y = determineYCoordinate(vertex_order))
+    p <- ggplot()
+    p <- p + ggtitle(paste("Last conducted move:\n", last_move, "\n", "Beta value:", beta, "(", last_beta_move, ")\n", "beta sd: ", beta_sd, sep=""))
+
+    # Draw lines representation of interaction edges
+    all_edges <- sapply(1:N, function(idx) extractAllEdges(idx, I_incidence = I_incidence, B_prior = B_prior, coords=coords))
+    all_edges <- all_edges[!is.na(all_edges)]
+    all_edges <- bind_rows(all_edges)
+
+    p <- p + geom_segment(data=all_edges, mapping=aes(x = x, y = y, xend = xend, yend = yend, linetype=line_color), colour="black")
+    p <- p + scale_linetype_manual(
+            values = c("non-prior edge" = "longdash", "prior edge" = "solid"),
+            name="Interaction edge type"
+        )
+
+    # Draw points and the text
+    all_ids <- 1:N
+    rna_names <- c("circRNA", "miRNA", "mRNA")
+
+    p <- p + geom_point(data=coords, mapping=aes(x,y, color=as.factor(rna_names[vertex_types])), size=10)
+    p <- p + xlim(0, 2 * GRAPH_SIZE_FACTOR + 5)
+    p <- p + geom_text(data=coords, mapping=aes(x,y, color=as.factor(rna_names[vertex_types])),
+                       label = ifelse(vertex_types != 3, all_ids, ""), nudge_y = -1, colour="black")
+    p <- p + geom_text(data=coords, mapping=aes(x,y, color=as.factor(rna_names[vertex_types])),
+                       label = ifelse(vertex_types == 3, all_ids, ""), nudge_x = 3, colour="black")
+    p <- p + scale_color_manual(
+        values = list("circRNA" = "white", "miRNA" = "green", "mRNA" = "blue"),
+        name = "RNA type")
+
+
+
+    return(p)
+
+    segment_data = data.frame(
+        x = c(1, 5),
+        xend = c(1, 5),
+        y = c(20, 30),
+        yend = c(30, 40),
+        line_color=c("prior edge", "non-prior edge")
+    )
+    #p <- p + ggplot(data = segment_data, aes(x = x, y = y, xend = xend, yend = yend, colour=line_color))
+    p <- p + new_scale_color() + scale_color_manual(
+        values = list("prior edge" = "red", "non-prior edge" = "green"),
+        name = "Edge type")
+
+    p <- p + geom_segment(data = segment_data, aes(x = x, y = y, xend = xend, yend = yend, colour=line_color))
+
+    return(p)
+}
+
+
+
+gridLayout <- function(N)
+{
+    W <- ceiling(sqrt(N))
+    H <- ceiling(N / W)
+    x_row <- 1:W / W
+
+    X <- rep(x_row, H)
+    Y <- rep(1:H / H, each=W)
+    coords <- matrix(c(X, Y), ncol=2)
+    coords <- coords[1:N,]
+
+    return(coords)
+}
+
+# Draw the structure space
+#
+# target_struct:
+#               NA ---- just color the source
+#               target_struct ----- color
+draw_structure_space <- function(all_parent_set_combos, source_struct, target_struct=NA)
+{
+    L <- length(unlist(all_parent_set_combos))
+
+    printf("Size is %d\n", L)
+    adjmat <- matrix(0, L, L)
+    colnames(adjmat) <- 1:L
+    rownames(adjmat) <- 1:L
+
+    #adjmat[sample(L, 1), sample(L, 1)] <- 1
+
+    G <- graph_from_adjacency_matrix(
+        adjmat
+    )
+
+    V(G)$color <- rep("red", L)
+    V(G)$color[45] <- "green"
+    V(G)$vsize <- rep(3, L)
+    #V(G)$vsize[45] <- 10
+
+    coords <- gridLayout(L)
+
+    plot(G, layout = coords,
+         vertex.size = V(G)$vsize,
+         vertex.color=V(G)$color,
+         vertex.frame.color= "white",
+         vertex.label.color = "white",
+         vertex.label.family = "sans",
+         edge.width=2,
+         edge.color="black",
+         main="Structure space")
+
+    #legend("topleft",bty = "n",
+    #       legend=levels(Group),
+    #       fill=pal, border=NA)
+}
+
+
+
+
+# A which for multidimensional arrays.
+# Mark van der Loo 16.09.2011
+#
+# A Array of booleans
+# returns a sum(A) x length(dim(A)) array of multi-indices where A == TRUE
+#
+# Taken from:
+# https://www.r-bloggers.com/2011/09/a-multidimensional-which-function/
+multi.which <- function(A){
+    if ( is.vector(A) ) return(which(A))
+    d <- dim(A)
+    T <- which(A) - 1
+    nd <- length(d)
+    t( sapply(T, function(t){
+        I <- integer(nd)
+        I[1] <- t %% d[1]
+        sapply(2:nd, function(j){
+            I[j] <<- (t %/% prod(d[1:(j-1)])) %% d[j]
+        })
+        I
+    }) + 1 )
+}
+
+
+
+
+

--- a/data/MMPC_SK.RData
+++ b/data/MMPC_SK.RData
--- a/data/PiM_SK_SS_empirical.RData
+++ b/data/PiM_SK_SS_empirical.RData
--- a/data/final_all_samples_miRNA_seq.xlsx
+++ b/data/final_all_samples_miRNA_seq.xlsx
--- a/data/hsa_circ_0000227-correlated-filtered-short.xlsx
+++ b/data/hsa_circ_0000227-correlated-filtered-short.xlsx
--- a/data/hsa_circ_0000227-expressed-short.xlsx
+++ b/data/hsa_circ_0000227-expressed-short.xlsx
--- a/data/hsa_circ_0000227_circGPA.RData
+++ b/data/hsa_circ_0000227_circGPA.RData
--- a/data/hsa_circ_0000228-correlated-filtered-short.xlsx
+++ b/data/hsa_circ_0000228-correlated-filtered-short.xlsx
--- a/data/hsa_circ_0000228-expressed-short.xlsx
+++ b/data/hsa_circ_0000228-expressed-short.xlsx
--- a/data/hsa_circ_0000228_circGPA.RData
+++ b/data/hsa_circ_0000228_circGPA.RData
--- a/data/hsa_circ_0003793-correlated-filtered-short.xlsx
+++ b/data/hsa_circ_0003793-correlated-filtered-short.xlsx
--- a/data/hsa_circ_0003793-expressed-short.xlsx
+++ b/data/hsa_circ_0003793-expressed-short.xlsx
--- a/data/hsa_circ_0003793_circGPA.RData
+++ b/data/hsa_circ_0003793_circGPA.RData
--- a/data/input_data.RData
+++ b/data/input_data.RData
--- a/source/circGPA/.gitignore
+++ b/source/circGPA/.gitignore
+# History files
+.Rhistory
+.Rapp.history
+
+# Session Data files
+.RData
+.RDataTmp
+
+# User-specific files
+.Ruserdata
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# R Environment Variables
+.Renviron
+
+# pkgdown site
+docs/
+
+# translation temp files
+po/*~
+
+# RStudio Connect folder
+rsconnect/
+
--- a/source/circGPA/README.txt
+++ b/source/circGPA/README.txt
+This repository contains source code and data needed to generate outputs
+needed for the following paper:
+
+# TODO the paper citation once published
+Petr Ryšavý, Jiří Kléma, Michaela Dostálová Merkerová
+circGPA: circRNA Functional Annotation Based on Probability-generating Functions
+
+Outputs might by downloaded from: https://ida.fel.cvut.cz/~rysavy/circgpa/
+
+=== Before we start ===
+First, the following packages need to be installed in R. Open R terminal and install using the following commands:
+install.packages("openxlsx")
+install.packages("xlsx")
+install.packages("readr")
+install.packages("pracma")
+install.packages("stringr")
+install.packages("polynom")
+install.packages("geometry")
+install.packages("tictoc")
+BiocManager::install("miRBaseConverter")
+install.packages("GO.db")
+install.packages("org.Hs.eg.db")
+install.packages("httr")
+install.packages("biomaRt")
+install.packages("multiMiR")
+install.packages("msigdbr")
+install.packages("GGally")
+install.packages("network")
+install.packages("sna")
+Next, download the source code with the associated graph. There is no need to build your own graph. A graph of interactions is included in the repository. To download the code, call in bash:
+git clone https://github.com/petrrysavy/circgpa-paper.git
+
+==== Run with the default graph ====
+The repository you just downloaded comes with the graph of interactions used in the paper's experiments. To generate the output, go to bash and call
+> ./circgpa-paper/run.sh hsa_circ_0000228
+Some of the outputs that can be generated using this command are available on https://ida.fel.cvut.cz/~rysavy/circgpa/.
+
+==== Run with a custom graph ====
+If you want to run the circGPA algorithm on your own interaction graph, you need to provide four inputs (assuming a fixed ordering on miRNAs and mRNAs):
+* Ammu - the adjacency matrix between the mRNAs and miRNAs. Row m, column mu is 1 iff mRNA m interacts with muRNA mu, 0 otherwise.
+* Amuc - a binary vector, where 1 at position mu indicates that the miRNA mu interacts with the circRNA c of interest.
+* gom - a binary vector of annotations of mRNAs - 1 if the mRNA m is annotated, 0 otherwise.
+* gomu - a binary vector of annotations of miRNAs. Similar to gom.
+To run the code on the toy network from the paper, go to R in the circgpa-paper folder and call:
+source('annotate.R')
+gomu <- c(0,1,1);
+gom <- c(1,1,1,0,0);
+Amuc <- c(1,1,1);
+Ammu <- matrix(c(0,1,1,1,1,1,0,0,1,1,0,0,1,1,0), nrow=5, ncol=3, byrow=TRUE);
+annotateVectorized(Amuc , gomu, gom, Ammu)
+
+==== Known limitations ====
+The algorithm requires a long double value with more exponent bits than common on regular desktop computers. The used long double type needs to accommodate (together with some room for operations with them) binomial coefficients up to the number of mRNAs over the size of the annotation term. See pvalue.cpp. 
+
+==== Sources of the data for the default interaction graph ====
+The included RData and CSV files contain a snapshot of interaction graph obtained
+from the following databases:
+
+[1] CircInteractome (https://circinteractome.nia.nih.gov/index.html)
+
+Dudekula DB, Panda AC, Grammatikakis I, De S, Abdelmohsen K, and Gorospe M.
+CircInteractome: A web tool for exploring circular RNAs and their interacting
+proteins and microRNAs. RNA Biology, 2016, Jan 2;13(1):34-42
+
+
+[2] multiMiR R package (http://multimir.org/)
+
+Yuanbin Ru*, Katerina J. Kechris*, Boris Tabakoff, Paula Hoffman, Richard A. Radcliffe,
+Russell Bowler, Spencer Mahaffey, Simona Rossi, George A. Calin, Lynne Bemis,
+and Dan Theodorescu. (2014) The multiMiR R package and database: integration
+of microRNA-target interactions along with their disease and drug associations.
+Nucleic Acids Research, doi: 10.1093/nar/gku631.
+
+[3] ENA Quick GO database (https://www.ebi.ac.uk/QuickGO/)
+
+[4] MSigDB database C5 cathegory (http://www.gsea-msigdb.org/gsea/msigdb/)
+
+Subramanian, A., Tamayo, P., Mootha, V. K., Mukherjee, S., Ebert, B. L., Gillette,
+M. A., ... & Mesirov, J. P. (2005). Gene set enrichment analysis: a knowledge-based
+approach for interpreting genome-wide expression profiles. Proceedings of the National
+Academy of Sciences, 102(43), 15545-15550.
--- a/source/circGPA/annotate.R
+++ b/source/circGPA/annotate.R
+# This script should calculate annotation of a circRNA with GO terms
+
+library(polynom)
+library(geometry) # only for the dot function
+library(tictoc)
+source('buildGraph.R')
+source('pValue.R')
+
+setClass("annotationResult",
+    slots=list(
+        score="numeric",
+        pvalue="numeric",
+        scorenorm="numeric",
+        expectedScore="numeric",
+        time="numeric",
+        pvalueTime="numeric",
+        bootstrap="numeric",
+        bootstrapTime="numeric",
+        goSize="numeric"
+    )
+)
+print.annotationResult <- function(x) { cat(paste("Normalized score:", x@scorenorm, "\n")); cat(paste("p-value:", x@pvalue, "\n")); }
+
+Im <- buildIm()
+
+annotateCirc <- function(circRNA, goTermList, goLower = -1, goUpper = Inf) {
+    muCirc <- buildMuCirc(circRNA)
+    return(sapply(goTermList, function(goTerm) {
+        tryCatch({
+            print(paste("Probing GO term:", goTerm))
+            goMu <- buildGOMu(goTerm)
+            goM <- buildGOM(goTerm)
+            goSize <- sum(goMu) + sum(goM)
+            if(goSize > goUpper | goSize < goLower) { return(NULL) }
+            sol <- annotateVectorized(muCirc, goMu, goM, Im)
+            print(sol)
+            return(sol)
+        }, error=function(cond) {
+            print(cond)
+            return(NULL)
+        })
+    }))
+}
+
+annotate <- function(circRNA, goTerm) {
+    return(annotateVectorized(buildMuCirc(circRNA), buildGOMu(goTerm), buildGOM(goTerm), Im))
+}
+
+# dataVector = vector with counts how many times each muRNA/mRNA is connected to the circRNA
+buildPolynomial <- function(dataVector) {
+    tbl <- as.data.frame(table(factor(dataVector, levels = 0:max(dataVector))))
+    return(polynomial(coef = tbl[,2]))
+}
+
+# uniform probability vector for goMu or goM - vec(1)*sum(arr) / len(arr)
+uniformize <- function(goVec) {
+    return(rep(1, length(goVec)) * sum(goVec) / length(goVec))
+}
+
+pValueSimple <- function(muCirc, goMu, goM, ImmuCirc, score) {
+    # get the p-value, convert the muCirc vector to the polynomial
+    muCircPoly <- buildPolynomial(muCirc)
+    ImMuCircPoly <- buildPolynomial(ImmuCirc)
+    # now calculate power of those to get the number of ways
+    generatingPoly <- muCircPoly^sum(goMu) * ImMuCircPoly^sum(goM)
+    generatingCoefs <- coefficients(generatingPoly)
+    # the p-value is how many of the combinations gave higher score than we have
+    return(sum(generatingCoefs[(score+1):length(generatingCoefs)]) / sum(generatingCoefs)) # score + 1 as coefficients are x^0 + x^1 indexed from 1
+}
+
+# goMu - 0/1 vector of is mu in goTerm
+# goM the same is mRNA in goTerm
+# muCirc - 0/1/more vecotor of circrna neighborhood
+# Im - matrix with interactions between muRNA nad miRNA
+# test:
+# gomu <- c(0,1,1); gom <- c(1,1,1,0,0);  mucirc <- c(1,1,1); im <- matrix(c(0,1,1,1,1,1,0,0,1,1,0,0,1,1,0), nrow=5, ncol=3, byrow=TRUE); annotateVectorized(mucirc, gomu, gom, im)
+annotateVectorized <- function(muCirc, goMu, goM, Im) {
+    time <- as.numeric(system.time({
+        ImmuCirc <- Im %*% muCirc
+        score <- dot(muCirc, goMu) + dot(ImmuCirc, goM) # ignore the halfs, I love whole numbers
+        pvalueTime <- as.numeric(system.time({
+            pvalue <- pValueFull(muCirc, goMu, goM, ImmuCirc, score)
+        })['elapsed'])
+        bootstrapTime <- as.numeric(system.time({
+            bootstrap <- NaN #pValueBootstrap(muCirc, goMu, goM, ImmuCirc, score) # NaN #
+        })['elapsed'])
+        expectedScore <- dot(muCirc, uniformize(goMu)) + dot(ImmuCirc, uniformize(goM))
+        scorenorm <- score / expectedScore
+        goSize <- sum(goM) + sum(goMu)
+    })['elapsed'])
+    # return
+    return(new("annotationResult",
+        score=score, pvalue=pvalue, scorenorm=scorenorm, expectedScore=expectedScore, time=time, pvalueTime=pvalueTime, bootstrap=bootstrap, bootstrapTime=bootstrapTime, goSize=goSize
+    ))
+}
--- a/source/circGPA/bootstrap.cpp
+++ b/source/circGPA/bootstrap.cpp
+#include "bootstrap.h"
+
+#include <Rcpp.h>
+#include <cstdlib>
+#include <cstring>
+
+void sample(bool* arr, int size, int n) {
+    int pos;
+    while(n != 0) {
+        do { pos = (rand() % size); } while(arr[pos]);
+        arr[pos] = true;
+        --n;
+    }
+}
+
+int dot(IntegerVector counts, bool* index) {
+    int product = 0;
+    for(IntegerVector::iterator i = counts.begin(); i != counts.end(); ++i, ++index)
+        product += (*i) * ((int) (*index));
+    return product;
+}
+
+// test: pvalue_bootstrap(c(1,1,1), 2, c(2,3,1,1,2), 3, 8, 1000)
+// [[Rcpp::export]]
+double pvalue_bootstrap(IntegerVector weightsMu, int goMuSize, IntegerVector weightsM, int goMSize, int score, int trials) {
+    const int muSize = weightsMu.size();
+    const int mSize = weightsM.size();
+    bool* goMu = new bool[muSize];
+    bool* goM = new bool[mSize];
+    
+    int positive = 0;
+    for(int i = 0; i < trials; ++i) {
+        memset(goMu, 0, muSize * sizeof(bool));
+        memset(goM, 0, mSize * sizeof(bool));
+        
+        sample(goMu, muSize, goMuSize);
+        sample(goM, mSize, goMSize);
+        
+        const int sample = dot(weightsMu, goMu) + dot(weightsM, goM);
+        if(sample >= score) ++positive;
+    }
+    
+     delete[] goMu;
+     delete[] goM;
+    
+    return ((double) positive + 1) / trials + 1;
+}
--- a/source/circGPA/bootstrap.h
+++ b/source/circGPA/bootstrap.h
+#ifndef GUARD_bootstrap_h
+#define GUARD_bootstrap_h
+
+#include<Rcpp.h>
+
+using namespace std;
+using namespace Rcpp;
+
+void sample(bool* arr, int size, int n);
+
+int dot(IntegerVector counts, bool* index);
+
+double pvalue_bootstrap(IntegerVector weightsMu, int goMuSize, IntegerVector weightsM, int goMSize, int score, int trials);
+
+#endif
--- a/source/circGPA/bootstrap_spearman_experiment/bootstrapSpearmanExperiment.R
+++ b/source/circGPA/bootstrap_spearman_experiment/bootstrapSpearmanExperiment.R
+# sbatch -p cpulong run.sh hsa_circ_0000228
+args = commandArgs(trailingOnly=TRUE)
+circname <- args[1]
+
+source('../annotate.R', chdir=TRUE)
+source('../goterms.R', chdir=TRUE)
+
+# goTermNames <- c('GO:0045595', 'GO:0014013')
+# annotateCirc('hsa_circ_0000228')
+
+library("openxlsx")
+
+library('pracma')
+library('Rcpp')
+library('ggpubr')
+library('readr')
+sourceCpp('bootstrapSpearmanExperiment.cpp')
+
+# sampling a log-uniform distribution, then convert to integers
+# ... this will be our points of interrest for the plot
+samplePoints <- c(1:99, logspace(2, 6, 901))
+samplePoints <- as.integer(samplePoints)
+
+annotateLog <- function(muCirc, goMu, goM, Im) {
+    ImmuCirc <- Im %*% muCirc
+    score <- dot(muCirc, goMu) + dot(ImmuCirc, goM) # ignore the halfs, I love whole numbers
+    pvalue <- pValueFull(muCirc, goMu, goM, ImmuCirc, score)
+    if(score == 0) { bootstrap <- rep(1.0, length(samplePoints)) }
+    else {  bootstrap <- pvalue_bootstrap_logging(muCirc, sum(goMu), ImmuCirc, sum(goM), score, samplePoints) }
+    return(list(pvalue=pvalue, bootstrap=bootstrap))
+}
+
+annotateCirc <- function(circRNA) {
+    muCirc <- buildMuCirc(circRNA)
+    sapply(goTermNames, function(goTerm) {
+        tryCatch({
+            print(paste("Probing GO term:", goTerm))
+            goMu <- buildGOMu(goTerm)
+            goM <- buildGOM(goTerm)
+            return(annotateLog(muCirc, goMu, goM, Im))
+        }, error=function(cond) {
+            print(cond)
+            return(NULL)
+        })
+    })
+}
+
+res <- annotateCirc(circname)
+saveRDS(res, file=paste(circname, ".RDS", sep=""))
+# res <- readRDS('hsa_circ_0000228.RDS')
+# sapply(res["bootstrap",], function(x) unlist(x)[999])
+#  unlist(res["pvalue",])
+
+#unlisted <- unlist(sapply(names(res), function(key) unlist(res[key])))
+
+
+#pval <- unlist(sapply(names(res), function(key) as.numeric(unlisted[paste(key, '.', key, '.pvalue', sep="")])))
+#pval <- unlist(sapply(colnames(res), function(key) as.numeric(unlist(res[,key])[paste(key, '.pvalue', sep="")])))
+pval <- unlist(sapply(colnames(res), function(key) as.numeric(unlist(res[,key]$pvalue))))
+nona <- !is.na(pval)
+#pval <- unlist(res["pvalue",])
+
+#getCorrelation <- function(i) {
+#    boot <- sapply(res["bootstrap",], function(x) unlist(x)[i])
+#    return(cor(pval, boot, method="spearman"))
+#}
+
+getCorrelation <- function(i) {
+    print(i)
+    boot <- unlist(sapply(colnames(res), function(key) as.numeric(unlist(res[,key]$bootstrap[i]))))
+    return(cor(pval[nona], boot[nona], method="spearman"))
+}
+
+spearmans <- sapply(1:length(samplePoints), function(i) {getCorrelation(i)})
+write_lines(spearmans, paste(circname, "-spearmans.dat", sep=""))
+# write_lines(spearmans, paste("fsadfaa", ".dat", sep=""))
+
+pval2 <- pval[pval<0.5]
+nona2 <- !is.na(pval2) & !is.na(names(pval2))
+pval2 <- pval2[nona2]
+getCorrelation2 <- function(i) {
+    print(i)
+    boot <- unlist(sapply(names(pval2), function(key) as.numeric(unlist(res[,key]$bootstrap[i]))))
+    return(cor(pval2, boot, method="spearman"))
+}
+spearmans <- sapply(1:length(samplePoints), function(i) {getCorrelation2(i)})
+write_lines(spearmans, paste(circname, "-spearman2.dat", sep=""))
+
+
+pval3 <- pval[pval<0.05]
+nona3 <- !is.na(pval3) & !is.na(names(pval3))
+pval3 <- pval3[nona3]
+getCorrelation3 <- function(i) {
+    print(i)
+    boot <- unlist(sapply(names(pval3), function(key) as.numeric(unlist(res[,key]$bootstrap[i]))))
+    return(cor(pval3, boot, method="spearman"))
+}
+spearmans <- sapply(1:length(samplePoints), function(i) {getCorrelation3(i)})
+write_lines(spearmans, paste(circname, "-spearman3.dat", sep=""))
+