diff --git a/source/skeleton.R b/source/skeleton.R index 00871b6b494f80c725907753accf08cd5e622f04..877ee23b443c8807c36f2aaa77066d7f83a02290 100644 --- a/source/skeleton.R +++ b/source/skeleton.R @@ -200,15 +200,17 @@ P_D_G <- function(G_adjs, param_in) pDG1 <- pDG1 + BGe_compute(adj, param_in) } - nodes_cand <- rownames(G_adjs[[1]]) - pDG2 <- sum(sapply(nodes_cand, function(node) - { - score <- BGe_node_compute(node, integer(0), param_in) - return(score) - }) - ) - - return(pDG1 + pDG2) + # They are still the same! + # nodes_cand <- rownames(G_adjs[[1]]) + # pDG2 <- sum(sapply(nodes_cand, function(node) + # { + # score <- BGe_node_compute(node, integer(0), param_in) + # return(score) + # }) + # ) + # + pDG2 + + return(pDG1) } #' @@ -378,7 +380,7 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be # this is a large computation that computes a large parameter matrix: # N x N, where N is the sum of sizes, e.g., total number of genes cat("Computing large BGe matrix...\n") - if(FALSE) + if(TRUE) { myScore <- scoreparameters_BiDAG_BGe(n = ncol(GE), data = GE) param_in <- list(n=myScore$n, awpN=myScore$awpN, TN=myScore$TN, scoreconstvec=myScore$scoreconstvec) @@ -394,10 +396,18 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be set.seed(0) + start.time <- Sys.time() + print(start.time) + E_f_G_out <- c_SK_SS(PK_in,SK_in, sizes, param_in, I, debug, beta_L, beta_H, score_G, score_change_G) + + end.time <- Sys.time() + print(end.time) + print(end.time - start.time) + print(E_f_G_out) E_f_G_out <- lapply(E_f_G_out, function(m) m > alpha) diff --git a/source/skeleton.cpp b/source/skeleton.cpp index 6b623982852988b534b62df9dcc019d71b47d3a8..062a41dc2b59808e22e221a06a7a2eb690055bc7 100644 --- a/source/skeleton.cpp +++ b/source/skeleton.cpp @@ -8,6 +8,7 @@ #include <string> #include <vector> #include <algorithm> +#include <map> // Pure C99 #include <cstring> @@ -67,9 +68,16 @@ struct key_hash : public std::unary_function<key_t, std::size_t> { std::size_t operator()(const key_t& k) const { - return (std::size_t)(std::get<0>(k)) ^ - (std::size_t)(std::get<1>(k)) ^ - (std::size_t)(std::get<2>(k)); + std::size_t ret = 0; + ret ^= std::hash<uint32_t>()(std::get<0>(k)) + 0x9e3779b9 + (ret<<6) + (ret>>2); + ret ^= std::hash<uint32_t>()(std::get<1>(k)) + 0x9e3779b9 + (ret<<6) + (ret>>2); + ret ^= std::hash<uint32_t>()(std::get<2>(k)) + 0x9e3779b9 + (ret<<6) + (ret>>2); + + return ret; + + // return (std::size_t)(std::get<0>(k)) ^ + // (std::size_t)(std::get<1>(k)) ^ + // (std::size_t)(std::get<2>(k)); } }; @@ -118,6 +126,14 @@ struct GraphChange */ GraphChange* prev = nullptr; + + /** + * Note: since unordered_map does not have any guarantee about the key order, + * the random sampling is modified by the changing key ordering! + * + * If such behavior is unacceptable, consider using std::map, but this + * will cost a change of complexity from O(1) to O(log n) + */ std::unordered_map< const key_t, GraphChange*, @@ -157,7 +173,7 @@ struct GraphChange * ! Note -> nodes with * haven't been visited ! * * > if no flag is used to determine the node in Trie, we can't mark the above - * > when pushing a word, only its last tail will be marked + * > when pushing a word, only its last tail node will be marked */ bool is_graph = false; @@ -214,6 +230,11 @@ struct GraphChange return prev; } + + void printG() + { + Rcout << listID << "," << from << "," << to << "; " << is_graph << "\n"; + } }; struct graph_comp @@ -238,7 +259,7 @@ public: ~Trie() { - + // TODO free, but not really needed for single script :) } @@ -250,6 +271,12 @@ public: { if(word.size() == 0) return false; + /* + Rcout << "adding a word:\n"; + for(key_t k : word) + Rcout << "["<< (uint32_t)std::get<0>(k) << "," << (uint32_t)std::get<1>(k) << "," << (uint32_t)std::get<2>(k) << "]"; + Rcout<<"\n"; + */ // Now it is easier to add a word to trie: // since characters are sorted, just traverse and add individual nodes @@ -257,6 +284,7 @@ public: if(this->root.find(word[0]) == this->root.end()) { current = new GraphChange(word[0], nullptr); + root[word[0]] = current; } else{ current = root.at(word[0]); @@ -299,25 +327,55 @@ public: } } - + /** + * Randomly chooses a graph from the Trie and sets the G(adjacency) graph + * to it + */ void sampleGraph(std::vector<LogicalMatrix*>& G_adjs) { uint32_t currIndex = 0; // TODO ideally the Rcpp::sample should be modified // to handle larger numbers - uint32_t sampledIndex = Rcpp::sample(INT_MAX, 1)[0]; + uint32_t sampledIndex = Rcpp::sample(this->total_num_graphs, 1)[0] - 1; - // TODO + this->find(&currIndex, sampledIndex, G_adjs); } - // TODO maybe 32 bit won't be enough + // + /** + * Returns a number of subgraphs in Trie. + * + * Note: this isn't the same as the number of nodes. We count only the subgraphs, + * for more detail see the struct GraphChange + * + * Note 2: maybe this 32-bit number won't be enough, consider using 64bit number, + * but then, again, Rcpp::sample should be modified to 64-bit (2x sample + shift etc...)! + * e.g., .Machine$integer.max on tested machine:= 2147483647 + */ uint32_t total_num() { return this->total_num_graphs; } + + void print() + { + + for(const auto & [ key, value ] : this->root) + { + this->printRecursive(value); + } + } + protected: + /** + * Note: since unordered_map does not have any guarantee about the key order, + * the random sampling is modified by the changing key ordering! + * + * If such behavior is unacceptable, consider using std::map, but this + * will cost a change of complexity from O(1) to O(log n) + */ std::unordered_map< const key_t, GraphChange*, @@ -325,22 +383,68 @@ protected: > root; - // TODO assert total_num_graphs < INT_MAX - uint64_t total_num_graphs = 0; + + void printRecursive(GraphChange* curr) + { + Rcout << "recursive:\n"; + curr->printG(); + Rcout << "going down:\n"; + for(const auto & [ key, value ] : curr->next_map) + { + this->printRecursive(value); + } + Rcout << "going up:\n"; + } + - void addWordRecursive(std::vector<GraphChange*>& word, uint32_t index) + + + // TODO assert total_num_graphs < INT_MAX + uint32_t total_num_graphs = 0; + + void find(uint32_t* currIdx, const uint32_t& needed, std::vector<LogicalMatrix*>& G_adjs) { - key_t currKey = std::make_tuple(word[0]->listID, word[0]->from, word[0]->to); - if(root.find(currKey) == root.end()) + // Note: if the sampled subgraphs are too deep and stack memory overflow, + // then consider using the non-recursive version + for(const auto & [ key, value ] : this->root) { - // TODO + // Go down + value->forward(G_adjs); + *currIdx += value->is_graph; + + + this->recursiveFind(value, currIdx, needed, G_adjs); + if(*currIdx == needed) + { + break; + } + + value->backward(G_adjs); } } - void findIndex(uint64_t* currIdx, const uint64_t& needed) + + void recursiveFind(GraphChange* curr, uint32_t* currIdx, const uint32_t& needed, std::vector<LogicalMatrix*>& G_adjs) { - // TODO + if(*currIdx == needed) + { + return; + } + + curr->forward(G_adjs); + currIdx += curr->is_graph; + + for(const auto & [ key, value ] : curr->next_map) + { + this->recursiveFind(value, currIdx, needed, G_adjs); + if(*currIdx == needed) + { + return; + } + } + + curr->backward(G_adjs); return; } @@ -354,11 +458,34 @@ protected: // [[Rcpp::export]] double tst() { - IntegerVector rnd = Rcpp::sample(6, 6); - NumericMatrix t = NumericMatrix(2, 3, rnd.begin()); - Rcout << t << "\n"; + Trie trie; + trie.print(); + std::vector<key_t> word; + + + word.push_back(std::make_tuple(0, 5, 10)); + trie.appendChainAddOnly(word); + trie.print(); + + word.push_back(std::make_tuple(0, 7, 10)); + trie.appendChainAddOnly(word); + trie.print(); + + word.push_back(std::make_tuple(0, 5, 11)); + trie.appendChainAddOnly(word); + trie.print(); + + word.push_back(std::make_tuple(0, 4, 10)); + trie.appendChainAddOnly(word); + trie.print(); + + word.erase(word.begin() + 0); + trie.appendChainAddOnly(word); + trie.print(); + + Rcout << "Final trie size: " << trie.total_num() << "\n"; + - Rcout << t * 2.5 << "\n"; return 0.0; } @@ -414,6 +541,16 @@ protected: std::vector<NumericMatrix*> E_f_G; std::vector<NumericMatrix*> current_counts; double currentProb = 1.0; // TODO + NumericVector initialProbs; + + + double bge_max = -99999999999.0; + double bge_min = 9999999999.0; + double energy_max = -99999999999.0; + double energy_min = 9999999999.0; + + double prob_max = -99999999999.0; + double prob_min = 9999999999.0; public: Network(IData input, ISettings settings) @@ -443,6 +580,7 @@ public: void SK_SS() { this->partialSK_SS(); + return; // TODO remove for(uint32_t i = 0; i < this->settings.I; i++) { @@ -454,12 +592,17 @@ public: List& result() { + Rprintf("BGe min/max: %.10f, %.10f\n", this->bge_min, this->bge_max); + Rprintf("Energy min/max: %.10f, %.10f\n", this->energy_min, this->energy_max); + Rprintf("Prob min/max: %.10f, %.10f\n", this->prob_min, this->prob_max); + Rprintf("Initial probs: %.10f, %.10f\n", this->initialProbs[0], this->initialProbs[1]); + const List* SK_list = this->input_data.SK; List* ret = new List(SK_list->length()); for(uint32_t i = 0; i < SK_list->length(); i++) { - *this->E_f_G[i] = (*this->E_f_G[i]) / (this->V_G.total_num() / 100.0); + *this->E_f_G[i] = (*this->E_f_G[i]); // / (this->V_G.total_num() / 100.0) (*ret)[i] = *this->E_f_G[i]; } @@ -469,22 +612,59 @@ public: protected: void SK_SS_phase2_iter() { - //TODO + this->V_G.sampleGraph(this->G); // sample adjacency matrix, e.g., sample G_0 + NumericVector twoScores = this->callScoreFull(); + + + const List* SK_list = this->input_data.SK; + + // for loop over 1...N + // for loop over neighbors + // try add edge + for(uint32_t i = 0; i < SK_list->length(); i++) + { + const IntegerMatrix& sk = (*SK_list)[i]; + + // Random permutation of the indices + uint32_t num = sk.ncol() * sk.nrow(); + IntegerVector xy = Rcpp::sample(num, num); + + // It is necessary to do 2d loop in COL, ROW order, since + // rows are local in memory by Rcpp standard + // UPD: after random permutation this doesn't matter + for(uint32_t j = 0; j < (uint32_t)sk.ncol(); j++) + { + for(uint32_t k = 0; k < (uint32_t)sk.nrow(); k++) + { + + // extract 2D coords from 1D random permutation + uint32_t lin = xy[j * sk.nrow() + k] - 1; + uint32_t r = lin % sk.nrow(); + uint32_t c = lin / sk.nrow(); + + Rcout << "\014" << + i << "," << j << "," << k << " /" << + (SK_list->length()) << "," << (uint32_t)sk.ncol() << "," << (uint32_t)sk.nrow() << + "\n" << "Trie size: " << this->V_G.total_num(); + + // if inside G_0 -> skip + if((*this->G[i])(r,c) != 0) + continue; + + // try add an edge outside of G_0 + this->tryPerformMove(&twoScores, i, r, c, ChangeType::ADD); + } + } + } } void partialSK_SS_iter() { - List& adjs = this->G_to_list(); - NumericVector twoScores = (*this->input_data.score_full_func)( - adjs, - *this->input_data.PK, - this->settings.beta_L, - this->settings.beta_H, - *this->input_data.bge_param - ); - delete &adjs; - + NumericVector twoScores = this->callScoreFull(); + this->initialProbs = twoScores; + + const List* SK_list = this->input_data.SK; // for loop over 1...N @@ -511,12 +691,12 @@ protected: uint32_t r = lin % sk.nrow(); uint32_t c = lin / sk.nrow(); - - //"\014" << - Rcout << lin << " = " << - i << "," << r << "," << c << " /" << + Rcout << "\014" << + i << "," << j << "," << k << " /" << (SK_list->length()) << "," << (uint32_t)sk.ncol() << "," << (uint32_t)sk.nrow() << - "\n"; + "\n" << "Trie size: " << this->V_G.total_num() << "\n" + << "BGe: max:" << this->bge_max << ", min:" << this->bge_min << "\n" + << "prob: " << this->currentProb << "\n"; if(sk(r,c) == 0) @@ -526,7 +706,9 @@ protected: } } } - + + return; // TODO remove + // for loop over 1...N // for loop over neighbors // try remove/add edge @@ -551,6 +733,13 @@ protected: uint32_t r = lin % gz.nrow(); uint32_t c = lin / gz.nrow(); + + Rcout << "\014" << + i << "," << j << "," << k << " /" << + (SK_list->length()) << "," << (uint32_t)gz.ncol() << "," << (uint32_t)gz.nrow() << + "\n" << "Trie size: " << this->V_G.total_num(); + + if(gz(r,c) == 0) // if not in G -> try add this->tryPerformMove(&twoScores, i, r, c, ChangeType::ADD); else // if in G -> try delete @@ -562,7 +751,7 @@ protected: // Normalize counts by 100 * iteration number for(uint32_t i = 0; i < SK_list->length(); i++) { - *this->current_counts[i] = (*this->current_counts[i]) / (this->settings.I * 100.0); + *this->current_counts[i] = (*this->current_counts[i]); // / (this->settings.I * 100.0) *this->E_f_G[i] += *this->current_counts[i]; } } @@ -650,6 +839,7 @@ protected: void tryPerformMove(NumericVector* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action) { + /* Rcout << "Trying to "; if(action == ChangeType::ADD) { @@ -659,24 +849,19 @@ protected: Rcout << "REMOVE"; } Rcout << " edge: " << listID << ", " << from << ", " << to << "\n"; + */ + NumericVector scoreChanges = this->callScoreChange(score, listID, from, to, action); - List& adjs = this->G_to_list(); - NumericVector scoreChanges = (*this->input_data.score_change_func) - ( - adjs, - *this->input_data.PK, - this->settings.beta_L, - this->settings.beta_H, - *this->input_data.bge_param, - *this->input_data.sizes, - *score, - listID, - from, - to, - action == ChangeType::ADD); + double bge = scoreChanges[1]; + double energy = scoreChanges[0]; + double prob = bge + energy; - delete &adjs; - + if(bge > this->bge_max) this->bge_max = bge; + if(bge < this->bge_min) this->bge_min = bge; + if(energy > this->energy_max) this->energy_max = energy; + if(energy < this->energy_min) this->energy_min = energy; + if(prob > this->prob_max) this->prob_max = prob; + if(prob < this->prob_min) this->prob_min = prob; /* * Here I tested multiple approaches to find the difference, because: * We are dealing with ~ -1e+08 or even more, thus @@ -686,7 +871,7 @@ protected: * in each scoring function. For reference see <score_change(...)> R function */ double mod_diff = (scoreChanges[3] + scoreChanges[4]); - Rcout << mod_diff << "\n"; + // Rcout << mod_diff << "\n"; /** * Modified(explicit per-partes) is used, e.g.: @@ -711,7 +896,11 @@ protected: this->makeMove(listID, from, to, action); + // Is it appropriate? + this->currentProb *= diff; + // TODO remove debug + /* if(action == ChangeType::ADD) { Rcout << "Adding"; @@ -719,7 +908,7 @@ protected: else{ Rcout << "Removing"; } - Rcout << " edge: " << listID << ", " << from << ", " << to << "\n"; + Rcout << " edge: " << listID << ", " << from << ", " << to << "\n"; */ } @@ -756,6 +945,44 @@ protected: } } + + NumericVector callScoreChange(NumericVector* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action) + { + List& adjs = this->G_to_list(); + NumericVector scoreChanges = (*this->input_data.score_change_func) + ( + adjs, + *this->input_data.PK, + this->settings.beta_L, + this->settings.beta_H, + *this->input_data.bge_param, + *this->input_data.sizes, + *score, + listID, + from, + to, + action == ChangeType::ADD); + + delete &adjs; + return scoreChanges; + } + + NumericVector callScoreFull() + { + List& adjs = this->G_to_list(); + NumericVector twoScores = (*this->input_data.score_full_func)( + adjs, + *this->input_data.PK, + this->settings.beta_L, + this->settings.beta_H, + *this->input_data.bge_param + ); + delete &adjs; + return twoScores; + } + + + }; /** @@ -814,6 +1041,9 @@ List c_SK_SS( net->SK_SS(); List ret = net->result(); + + + delete net; return ret;