diff --git a/source/skeleton.R b/source/skeleton.R index 877ee23b443c8807c36f2aaa77066d7f83a02290..6bef13f099777a02a8d5116d96351c9736915f1f 100644 --- a/source/skeleton.R +++ b/source/skeleton.R @@ -380,10 +380,11 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be # this is a large computation that computes a large parameter matrix: # N x N, where N is the sum of sizes, e.g., total number of genes cat("Computing large BGe matrix...\n") - if(TRUE) + if(FALSE) # TODO remove { myScore <- scoreparameters_BiDAG_BGe(n = ncol(GE), data = GE) param_in <- list(n=myScore$n, awpN=myScore$awpN, TN=myScore$TN, scoreconstvec=myScore$scoreconstvec) + GLOBAL_PARAM_IN <<- param_in remove(myScore) } else{ @@ -393,13 +394,13 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be } cat("TN matrix have a size of: ", dim(param_in$TN), "\n") - + set.seed(0) start.time <- Sys.time() print(start.time) - E_f_G_out <- c_SK_SS(PK_in,SK_in, sizes, param_in, + G_out <- c_SK_SS(PK_in,SK_in, sizes, param_in, I, debug, beta_L, beta_H, score_G, score_change_G) @@ -408,10 +409,8 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be print(end.time) print(end.time - start.time) - print(E_f_G_out) - - E_f_G_out <- lapply(E_f_G_out, function(m) m > alpha) - + print(G_out) + saveRDS(G_out, "G_out_1_iter.RData") # TODO convert to NxN matrix form return(E_f_G_out) diff --git a/source/skeleton.cpp b/source/skeleton.cpp index 062a41dc2b59808e22e221a06a7a2eb690055bc7..4b24bc144bb922956efa392c064305d68df7d944 100644 --- a/source/skeleton.cpp +++ b/source/skeleton.cpp @@ -61,8 +61,7 @@ typedef unsigned long long uint64_t; typedef std::tuple<uint32_t, uint32_t, uint32_t> key_t; /** - * Hashing of the triplet. Uses the simple XOR, - * TODO for future: replace with more sophisticated hashing, possible duplicates + * Hashing of the triplet. Uses the simple boost libraries' implementation of tuple hash */ struct key_hash : public std::unary_function<key_t, std::size_t> { @@ -74,10 +73,6 @@ struct key_hash : public std::unary_function<key_t, std::size_t> ret ^= std::hash<uint32_t>()(std::get<2>(k)) + 0x9e3779b9 + (ret<<6) + (ret>>2); return ret; - - // return (std::size_t)(std::get<0>(k)) ^ - // (std::size_t)(std::get<1>(k)) ^ - // (std::size_t)(std::get<2>(k)); } }; @@ -182,17 +177,6 @@ struct GraphChange this->is_graph = true; } - void connect(GraphChange* parent, GraphChange* children) - { - this->prev = parent; - if(children != nullptr) - { - // TODO assert children->prev == parent - this->next_map[children->key()] = children; - children->prev = this; - } - } - key_t key() { return std::make_tuple(this->listID, this->from, this->to); @@ -270,13 +254,6 @@ public: bool appendChainAddOnly(std::vector<key_t>& word) { if(word.size() == 0) return false; - - /* - Rcout << "adding a word:\n"; - for(key_t k : word) - Rcout << "["<< (uint32_t)std::get<0>(k) << "," << (uint32_t)std::get<1>(k) << "," << (uint32_t)std::get<2>(k) << "]"; - Rcout<<"\n"; - */ // Now it is easier to add a word to trie: // since characters are sorted, just traverse and add individual nodes @@ -284,6 +261,8 @@ public: if(this->root.find(word[0]) == this->root.end()) { current = new GraphChange(word[0], nullptr); + this->total_num_nodes++; + root[word[0]] = current; } else{ @@ -307,9 +286,12 @@ public: { // Create and append all the missing nodes current->next_map[word[index]] = new GraphChange(word[index], current); + this->total_num_nodes++; + for(;index < word.size();index++) { current->next_map[word[index]] = new GraphChange(word[index], current); + this->total_num_nodes++; current = current->next_map[word[index]]; } @@ -358,15 +340,11 @@ public: return this->total_num_graphs; } - - void print() + uint64_t nodes_num() { - - for(const auto & [ key, value ] : this->root) - { - this->printRecursive(value); - } + return this->total_num_nodes; } + protected: /** @@ -384,17 +362,7 @@ protected: root; - void printRecursive(GraphChange* curr) - { - Rcout << "recursive:\n"; - curr->printG(); - Rcout << "going down:\n"; - for(const auto & [ key, value ] : curr->next_map) - { - this->printRecursive(value); - } - Rcout << "going up:\n"; - } + @@ -402,13 +370,19 @@ protected: // TODO assert total_num_graphs < INT_MAX uint32_t total_num_graphs = 0; + uint64_t total_num_nodes = 0; void find(uint32_t* currIdx, const uint32_t& needed, std::vector<LogicalMatrix*>& G_adjs) { // Note: if the sampled subgraphs are too deep and stack memory overflow, // then consider using the non-recursive version - for(const auto & [ key, value ] : this->root) - { + + //for(const auto & [ key, value ] : this->root) + //{ + for(const auto &item : this->root) + { + const auto& value = item.second; + // Go down value->forward(G_adjs); *currIdx += value->is_graph; @@ -435,8 +409,12 @@ protected: curr->forward(G_adjs); currIdx += curr->is_graph; - for(const auto & [ key, value ] : curr->next_map) - { + //for(const auto & [ key, value ] : curr->next_map) + //{ + for(const auto &item : this->root) + { + const auto& value = item.second; + this->recursiveFind(value, currIdx, needed, G_adjs); if(*currIdx == needed) { @@ -450,46 +428,6 @@ protected: }; - - - - - -// [[Rcpp::export]] -double tst() -{ - Trie trie; - trie.print(); - std::vector<key_t> word; - - - word.push_back(std::make_tuple(0, 5, 10)); - trie.appendChainAddOnly(word); - trie.print(); - - word.push_back(std::make_tuple(0, 7, 10)); - trie.appendChainAddOnly(word); - trie.print(); - - word.push_back(std::make_tuple(0, 5, 11)); - trie.appendChainAddOnly(word); - trie.print(); - - word.push_back(std::make_tuple(0, 4, 10)); - trie.appendChainAddOnly(word); - trie.print(); - - word.erase(word.begin() + 0); - trie.appendChainAddOnly(word); - trie.print(); - - Rcout << "Final trie size: " << trie.total_num() << "\n"; - - - return 0.0; -} - - struct IData { const List* PK; @@ -512,16 +450,10 @@ struct ISettings class Network { protected: - uint32_t N; // number nodes - IData input_data; ISettings settings; Trie V_G; - - - std::vector<LogicalMatrix*> G; - /** * sort individual elements from lowest to largest by unordered_set * this will be used to push to TRIE -> count subgraphs @@ -529,36 +461,21 @@ protected: std::vector<key_t> ord_V_G; /** - * store 2 count matrices: - * > information about the current iteration counts only - * > after each iteration do: - * >> normalize current_counts by 100*I to prevent overflow - * >> E_f_G += current_counts; - * - * TODO: - * also store 1 last probability to multiply our counts? + * store the current adjacency graph and the best MAP model found during the search + * > update only when better model is found */ - std::vector<NumericMatrix*> E_f_G; - std::vector<NumericMatrix*> current_counts; - double currentProb = 1.0; // TODO - NumericVector initialProbs; - + std::vector<LogicalMatrix*> G; - double bge_max = -99999999999.0; - double bge_min = 9999999999.0; - double energy_max = -99999999999.0; - double energy_min = 9999999999.0; + double probabilityMaxOptim = -99999999999.0; + std::vector<LogicalMatrix*> G_optim; - double prob_max = -99999999999.0; - double prob_min = 9999999999.0; public: Network(IData input, ISettings settings) { this->input_data = input; this->settings = settings; - this->N = Rcpp::sum(*input.sizes); - + this->initializeLists(); } @@ -580,8 +497,7 @@ public: void SK_SS() { this->partialSK_SS(); - return; // TODO remove - + for(uint32_t i = 0; i < this->settings.I; i++) { Rcout << "SK-SS, Iteration: " << i << "\n"; @@ -592,18 +508,15 @@ public: List& result() { - Rprintf("BGe min/max: %.10f, %.10f\n", this->bge_min, this->bge_max); - Rprintf("Energy min/max: %.10f, %.10f\n", this->energy_min, this->energy_max); - Rprintf("Prob min/max: %.10f, %.10f\n", this->prob_min, this->prob_max); - Rprintf("Initial probs: %.10f, %.10f\n", this->initialProbs[0], this->initialProbs[1]); + Rprintf("Optimum log probability: %.10f\n", this->probabilityMaxOptim); + Rprintf("Trie total size: %.5d\n", this->V_G.total_num()); const List* SK_list = this->input_data.SK; List* ret = new List(SK_list->length()); for(uint32_t i = 0; i < SK_list->length(); i++) { - *this->E_f_G[i] = (*this->E_f_G[i]); // / (this->V_G.total_num() / 100.0) - (*ret)[i] = *this->E_f_G[i]; + (*ret)[i] = *this->G_optim[i]; } return *ret; @@ -642,17 +555,20 @@ protected: uint32_t r = lin % sk.nrow(); uint32_t c = lin / sk.nrow(); - Rcout << "\014" << - i << "," << j << "," << k << " /" << - (SK_list->length()) << "," << (uint32_t)sk.ncol() << "," << (uint32_t)sk.nrow() << - "\n" << "Trie size: " << this->V_G.total_num(); + Rprintf("\014[pSK-SS][add only][%5u,%5u,%5u]/[%5u,%5u,%5u]\n" + "Probability optim: %.10f\nTrie: nodes: %.10lu\n, subgraphs %.5u\n", + i,j,k, + SK_list->length(), (uint32_t)sk.ncol(), (uint32_t)sk.nrow(), + this->probabilityMaxOptim, this->V_G.nodes_num(), this->V_G.total_num()); // if inside G_0 -> skip if((*this->G[i])(r,c) != 0) continue; // try add an edge outside of G_0 - this->tryPerformMove(&twoScores, i, r, c, ChangeType::ADD); + // do not update V_G since we want to sample from partialSK-SS only! + // because of that -> shouldUpdateTrie = false + this->tryPerformMove(&twoScores, i, r, c, ChangeType::ADD, false); } } } @@ -662,9 +578,6 @@ protected: void partialSK_SS_iter() { NumericVector twoScores = this->callScoreFull(); - this->initialProbs = twoScores; - - const List* SK_list = this->input_data.SK; // for loop over 1...N @@ -676,7 +589,8 @@ protected: // Random permutation of the indices uint32_t num = sk.ncol() * sk.nrow(); - IntegerVector xy = Rcpp::sample(num, num); + //IntegerVector xy = Rcpp::sample(num, num); + IntegerVector xy = Rcpp::Range(1, num); // TODO remove // It is necessary to do 2d loop in COL, ROW order, since // rows are local in memory by Rcpp standard @@ -691,13 +605,11 @@ protected: uint32_t r = lin % sk.nrow(); uint32_t c = lin / sk.nrow(); - Rcout << "\014" << - i << "," << j << "," << k << " /" << - (SK_list->length()) << "," << (uint32_t)sk.ncol() << "," << (uint32_t)sk.nrow() << - "\n" << "Trie size: " << this->V_G.total_num() << "\n" - << "BGe: max:" << this->bge_max << ", min:" << this->bge_min << "\n" - << "prob: " << this->currentProb << "\n"; - + Rprintf("\014[pSK-SS][add only][%5u,%5u,%5u]/[%5u,%5u,%5u]\n" + "Probability optim: %.10f\nTrie: nodes: %.10lu\n, subgraphs %.5u\n", + i,j,k, + SK_list->length(), (uint32_t)sk.ncol(), (uint32_t)sk.nrow(), + this->probabilityMaxOptim, this->V_G.nodes_num(), this->V_G.total_num()); if(sk(r,c) == 0) continue; @@ -707,8 +619,7 @@ protected: } } - return; // TODO remove - + // for loop over 1...N // for loop over neighbors // try remove/add edge @@ -734,10 +645,11 @@ protected: uint32_t c = lin / gz.nrow(); - Rcout << "\014" << - i << "," << j << "," << k << " /" << - (SK_list->length()) << "," << (uint32_t)gz.ncol() << "," << (uint32_t)gz.nrow() << - "\n" << "Trie size: " << this->V_G.total_num(); + Rprintf("\014[pSK-SS][add only][%5u,%5u,%5u]/[%5u,%5u,%5u]\n" + "Probability optim: %.10f\nTrie: nodes: %.10lu\n, subgraphs %.5u\n", + i,j,k, + SK_list->length(), (uint32_t)gz.ncol(), (uint32_t)gz.nrow(), + this->probabilityMaxOptim, this->V_G.nodes_num(), this->V_G.total_num()); if(gz(r,c) == 0) // if not in G -> try add @@ -747,13 +659,6 @@ protected: } } } - - // Normalize counts by 100 * iteration number - for(uint32_t i = 0; i < SK_list->length(); i++) - { - *this->current_counts[i] = (*this->current_counts[i]); // / (this->settings.I * 100.0) - *this->E_f_G[i] += *this->current_counts[i]; - } } @@ -782,10 +687,6 @@ protected: *(this->G)[i] = LogicalMatrix(sk.nrow(), sk.ncol()); Rcpp::rownames(*(this->G)[i]) = Rcpp::rownames(sk); Rcpp::colnames(*(this->G)[i]) = Rcpp::colnames(sk); - - *(this->current_counts)[i] = NumericMatrix(sk.nrow(), sk.ncol()); - Rcpp::rownames(*(this->current_counts)[i]) = Rcpp::rownames(sk); - Rcpp::colnames(*(this->current_counts)[i]) = Rcpp::colnames(sk); } } @@ -807,13 +708,9 @@ protected: Rcpp::rownames(*(this->G)[i]) = Rcpp::rownames(sk); Rcpp::colnames(*(this->G)[i]) = Rcpp::colnames(sk); - (this->E_f_G).push_back(new NumericMatrix(sk.nrow(), sk.ncol())); - Rcpp::rownames(*(this->E_f_G)[i]) = Rcpp::rownames(sk); - Rcpp::colnames(*(this->E_f_G)[i]) = Rcpp::colnames(sk); - - (this->current_counts).push_back(new NumericMatrix(sk.nrow(), sk.ncol())); - Rcpp::rownames(*(this->current_counts)[i]) = Rcpp::rownames(sk); - Rcpp::colnames(*(this->current_counts)[i]) = Rcpp::colnames(sk); + (this->G_optim).push_back(new LogicalMatrix(sk.nrow(), sk.ncol())); + Rcpp::rownames(*(this->G_optim)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->G_optim)[i]) = Rcpp::colnames(sk); } } @@ -827,92 +724,57 @@ protected: for(uint32_t i = 0; i < SK_list->length(); i++) { delete this->G[i]; - delete this->E_f_G[i]; - delete this->current_counts[i]; + delete this->G_optim[i]; } this->G.clear(); - this->E_f_G.clear(); - this->current_counts.clear(); + this->G_optim.clear(); } - void tryPerformMove(NumericVector* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action) + void tryPerformMove(NumericVector* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action, + bool shouldUpdateTrie = true) { - /* - Rcout << "Trying to "; - if(action == ChangeType::ADD) - { - Rcout << "ADD"; - } - else{ - Rcout << "REMOVE"; - } - Rcout << " edge: " << listID << ", " << from << ", " << to << "\n"; - */ + // Compute the possible score NumericVector scoreChanges = this->callScoreChange(score, listID, from, to, action); - double bge = scoreChanges[1]; - double energy = scoreChanges[0]; - double prob = bge + energy; - - if(bge > this->bge_max) this->bge_max = bge; - if(bge < this->bge_min) this->bge_min = bge; - if(energy > this->energy_max) this->energy_max = energy; - if(energy < this->energy_min) this->energy_min = energy; - if(prob > this->prob_max) this->prob_max = prob; - if(prob < this->prob_min) this->prob_min = prob; - /* - * Here I tested multiple approaches to find the difference, because: - * We are dealing with ~ -1e+08 or even more, thus - * straightforward computation may overflow/underflow(when exp() is taken) - * - * The final choice -> compute difference in each score(BGe/Energy) separately - * in each scoring function. For reference see <score_change(...)> R function - */ - double mod_diff = (scoreChanges[3] + scoreChanges[4]); - // Rcout << mod_diff << "\n"; - /** * Modified(explicit per-partes) is used, e.g.: * * since we are working in small precisions + log scale: - * > difference is calculated in SCORE functions - * > difference is separated for BGe / Energy part - * > ONLY difference is converted by exp, since it is tractable! - * > try checking BGe score for large graph, it is ~ -55000 in log form + * > difference is calculated in SCORE functions manually + * > difference is separately calculated for BGe / Energy part + * > the previous is because they have orders of magnitude differences + * > ONLY afterwards the summed difference is converted by exp, since it is tractable! */ + double mod_diff = (scoreChanges[3] + scoreChanges[4]); double diff = exp(mod_diff); + // Convert the log difference to B/A and normalize it + // normalize(1, B/A) = normalize(A,B) + // that way we get probabilities for two candidates! NumericVector probs = {1.0, diff}; probs = probs / Rcpp::sum(probs); IntegerVector ids = {0, 1}; - IntegerVector ret = Rcpp::sample(ids, 1, probs = probs); + // If the first was sampled, skip if(ret[0] != 1) return; + // otherwise update current state: + + // Update score *score = scoreChanges; - this->makeMove(listID, from, to, action); - - // Is it appropriate? - this->currentProb *= diff; + // Update G if not in SK-SS part + this->makeMove(listID, from, to, action, shouldUpdateTrie); - // TODO remove debug - /* - if(action == ChangeType::ADD) - { - Rcout << "Adding"; - } - else{ - Rcout << "Removing"; - } - Rcout << " edge: " << listID << ", " << from << ", " << to << "\n"; */ + // Update optimum + this->CheckVisitedGraphOptimum(scoreChanges); } - void makeMove(uint32_t listID, uint32_t from, uint32_t to, ChangeType action) + void makeMove(uint32_t listID, uint32_t from, uint32_t to, ChangeType action, bool shouldUpdateTrie) { // update adjacency (*(this->G[listID]))(from,to) = action == ChangeType::ADD; @@ -927,25 +789,38 @@ protected: erase_sorted(this->ord_V_G, kt); } + // If SK-SS part -> don't update Trie + if(!shouldUpdateTrie) + { + return; + } + // Try to add word to Trie - bool shouldAdd = this->V_G.appendChainAddOnly(this->ord_V_G); + this->V_G.appendChainAddOnly(this->ord_V_G); - // If new subgraph found, update edge counts - if(shouldAdd) this->AddCounts(); + //bool isNewSubGraph = } - - void AddCounts() + void CheckVisitedGraphOptimum(NumericVector& scoreChanges) { - const List* SK_list = this->input_data.SK; - for(uint32_t i = 0; i < SK_list->length(); i++) - { - NumericMatrix tmp = Rcpp::as<NumericMatrix>(*this->G[i]) * currentProb; - *this->current_counts[i] += tmp; - } + double bge = scoreChanges[1]; + double energy = scoreChanges[0]; + double probabilityMovedG = bge + energy; + + if(probabilityMovedG > this->probabilityMaxOptim){ + this->probabilityMaxOptim = probabilityMovedG; + + const List* SK_list = this->input_data.SK; + for(uint32_t i = 0; i < SK_list->length(); i++) + { + //NumericMatrix tmp = Rcpp::as<NumericMatrix>(*this->G[i]); + *this->G_optim[i] = *this->G[i]; + } + } } + NumericVector callScoreChange(NumericVector* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action) { List& adjs = this->G_to_list();