diff --git a/source/skeleton.R b/source/skeleton.R index 46423d08b57a27ad05a036f88d125e696540d68c..313a6d48096f9637765554702227aa6939f08013 100644 --- a/source/skeleton.R +++ b/source/skeleton.R @@ -352,24 +352,11 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be # Extract every partition's GE,PK,SK parts - GE_in <- list() PK_in <- list() SK_in <- list() - for(i in 1:length(parts)) + for(i in 2:length(parts)) { - # GE - subGE <- GE[, parts[[i]]] - subGE <- as.matrix(subGE) - colnames(subGE) <- parts[[i]] - GE_in[[i]] <- subGE - remove(subGE) - - # interactions are 1 less, e.g., for K parts there are K-1 interaction parts - if(i == 1){ - next - } - # PK subPK <- PK[parts[[i-1]],parts[[i]]] subPK <- as.matrix(subPK) @@ -410,7 +397,7 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be set.seed(0) - return(c_SK_SS(GE_in,PK_in,SK_in, sizes, param_in, + return(c_SK_SS(PK_in,SK_in, sizes, param_in, I, debug, alpha, beta_L, beta_H, score_G, score_change_G)) } diff --git a/source/skeleton.cpp b/source/skeleton.cpp index ccd6883cf4cd1815cd2a61bb17c359a7517ee12b..593efa401f4a7487c1fcbae517130b9bf727d123 100644 --- a/source/skeleton.cpp +++ b/source/skeleton.cpp @@ -2,19 +2,31 @@ * This part will be the header file declarations in package, but <Rcpp::sourceCpp()> requires single file to work, so it is here for now */ +// Pure C++ #include <numeric> #include <bitset> -#include <cstring> #include <string> #include <vector> +#include <algorithm> + +// Pure C99 +#include <cstring> #include <cmath> +#include <limits.h> +/** + * We include the external library of RcppArmadillo, + * this allows for a future usage of linear algebra, advanced Rcpp, etc... + */ // [[Rcpp::depends(RcppArmadillo)]] #include <RcppArmadillo.h> #include <RcppArmadilloExtensions/sample.h> //#include <Rcpp.h> +/** + * Most used Rcpp objects, used to + */ using Rcpp::Rcout; using Rcpp::Rcerr; using Rcpp::NumericVector; @@ -26,26 +38,70 @@ using Rcpp::IntegerMatrix; using Rcpp::List; using Rcpp::Function; using Rcpp::IntegerVector; +using Rcpp::_; -using std::bitset; - +/** + * Ideally should be included from library, but let it be like that :) + */ typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; /** - * This part will be the separate header file for tuple hash + Trie data structures + * Score computation in CPP: TODO? + */ + +/** + * This part will be the separate header file for tuple + hash + Trie data structures */ #include <tuple> typedef std::tuple<uint32_t, uint32_t, uint32_t> key_t; +/** + * Hashing of the triplet. Uses the simple XOR, + * TODO for future: replace with more sophisticated, possible errors + */ struct key_hash : public std::unary_function<key_t, std::size_t> { std::size_t operator()(const key_t& k) const { - return std::get<0>(k) ^ std::get<1>(k) ^ std::get<2>(k); + return (std::size_t)(std::get<0>(k)) ^ + (std::size_t)(std::get<1>(k)) ^ + (std::size_t)(std::get<2>(k)); } }; +struct GraphChange; + +struct key_comp +{ + /** + * Java/C++ like comparator function: + * + * Comparison is performed in the order: + * listID(0) -> to(2) -> from(1) + */ + inline bool operator() (const key_t& k1, const key_t& k2) + { + if(std::get<0>(k1) != std::get<0>(k2)) return std::get<0>(k1) < std::get<0>(k2); + if(std::get<2>(k1) != std::get<2>(k2)) return std::get<2>(k1) < std::get<2>(k2); + return std::get<1>(k1) < std::get<1>(k2); + } +}; + +void insert_sorted( std::vector<key_t> & vec, key_t const& item ) +{ + vec.insert + ( + std::upper_bound( vec.begin(), vec.end(), item, key_comp()), + item + ); +} +void erase_sorted(std::vector<key_t> & vec, key_t const& item ) +{ + vec.erase(std::remove(vec.begin(), vec.end(), item), vec.end()); +} + enum ChangeType { @@ -53,68 +109,244 @@ enum ChangeType DELETE = false }; - struct GraphChange { GraphChange* next = nullptr; + + /** + * Reference to parent for potential traversal + */ GraphChange* prev = nullptr; - // unordered_map< - // tuple<uint32_t, uint32_t, uint32_t>, - // std::vector<GraphChange*>, - // std::hash_tuple::hash<tuple<uint32_t, uint32_t, uint32_t>> - // > next; std::unordered_map< const key_t, GraphChange*, key_hash - > + > next_map; - - - ChangeType type; - + /** + * key_t, but divided to be more readable + */ uint32_t from; uint32_t to; uint32_t listID; - bool is_graph; - GraphChange* forward(List& adjs) + /** + * After Trie receives a word, only the end of it is actually a subgraph, + * + * Example to illustrate the intermediate subgraph mechanics: + * add 2 -> add 1 -> add 5 -> add 4 -> delete 2 -> add 3 + * + * then, the visited subgraphs are: + * {[2], [1, 2], [1, 2, 5], [1, 2, 4, 5], [1, 4, 5], [1, 3, 4, 5]} + * + * + * which results in Trie: + * + * 2 + * root---| 5 + * | 2--| + * 1*--| 4*---5 + * | + * 4*--5 + * | + * 3*--4*--5 + * + * + * ! Note -> nodes with * haven't been visited ! + * + * > if no flag is used to determine the node in Trie, we can't mark the above + * > when pushing a word, only its last tail will be marked + */ + bool is_graph = false; + + void mark() + { + this->is_graph = true; + } + + void connect(GraphChange* parent, GraphChange* children) { - //LogicalMatrix& mat = Rcpp::as<LogicalMatrix>(adjs[listID]); - //adjs[listID](from, to) = type; + this->prev = parent; + if(children != nullptr) + { + // TODO assert children->prev == parent + this->next_map[children->key()] = children; + children->prev = this; + } + } + + key_t key() + { + return std::make_tuple(this->listID, this->from, this->to); + } + + + + GraphChange(key_t kt, GraphChange* parent) + { + this->listID = std::get<0>(kt); + this->from = std::get<1>(kt); + this->to = std::get<2>(kt); + + this->is_graph = false; - //const IntegerMatrix& sk = adjs[listID]; + this->prev = parent; + } + + /** + * When going down the node, we add the edge, storing 1 ADJ matrix only + */ + GraphChange* forward(std::vector<LogicalMatrix*>& adjs) + { + (*adjs[listID])(from, to) = true; return next; } - GraphChange* backward(List& adjs) + + /** + * When going up the node, we remove the edge, storing 1 ADJ matrix only + */ + GraphChange* backward(std::vector<LogicalMatrix*>& adjs) { - //LogicalMatrix& mat = Rcpp::as<LogicalMatrix>(adjs[listID]); - //mat(from, to) = 1-type; - //adjs[listID](from, to) = 1-type; - //const IntegerMatrix& sk = adjs[listID]; + (*adjs[listID])(from, to) = false; return prev; } +}; + +struct graph_comp +{ + inline bool operator() (GraphChange*& k1, GraphChange*& k2) + { + key_t t1 = std::make_tuple(k1->listID, k1->from, k1->to); + key_t t2 = std::make_tuple(k2->listID, k2->from, k2->to); + + return key_comp()(t1, t2); + } +}; + + +class Trie +{ +public: + Trie() + { + + } + + ~Trie() + { + + } + + + /** + * Append a word consisting of ADD only nodes + */ + void appendChainAddOnly(std::vector<key_t>& word) + { + if(word.size() == 0) return; + + + // Now it is easier to add a word to trie: + // since characters are sorted, just traverse and add individual nodes + GraphChange* current = nullptr; + if(this->root.find(word[0]) == this->root.end()) + { + current = new GraphChange(word[0], nullptr); + } + else{ + current = root.at(word[0]); + } + uint32_t index = 1; + + while(true) + { + // The entire word is in Trie + if(index == word.size()) + { + // update number of graphs, if subgraph wasn't visited -> +1 + this->total_num_graphs += !(current->is_graph); + current->mark(); // mark the node as graph + break; // stop + } + + if(current->next_map.find(word[index]) == current->next_map.end()) + { + // Create and append all the missing nodes + current->next_map[word[index]] = new GraphChange(word[index], current); + for(;index < word.size();index++) + { + current->next_map[word[index]] = new GraphChange(word[index], current); + current = current->next_map[word[index]]; + } + + // mark the last one only, for more details see GraphDeta + // update number of graphs, if subgraph wasn't visited -> +1 + this->total_num_graphs += !(current->is_graph); + current->mark(); + break; + } + + // Go down the Trie + current = current->next_map[word[index]]; + index++; + } + } + void sampleGraph(std::vector<LogicalMatrix*>& G_adjs) + { + uint32_t currIndex = 0; + + // TODO ideally the Rcpp::sample should be modified + // to handle larger numbers + uint32_t sampledIndex = Rcpp::sample(INT_MAX, 1)[0]; + + // TODO + } +protected: + std::unordered_map< + const key_t, + GraphChange*, + key_hash + > + root; + // TODO assert total_num_graphs < INT_MAX + uint64_t total_num_graphs = 0; + void addWordRecursive(std::vector<GraphChange*>& word, uint32_t index) + { + key_t currKey = std::make_tuple(word[0]->listID, word[0]->from, word[0]->to); + if(root.find(currKey) == root.end()) + { + // TODO + } + } + + void findIndex(uint64_t* currIdx, const uint64_t& needed) + { + // TODO + return; + } + }; + // [[Rcpp::export]] double tst() { - + return 0.0; } @@ -122,7 +354,6 @@ double tst() struct IData { - const List* GE; const List* PK; const List* SK; const NumericVector* sizes; @@ -144,15 +375,24 @@ class Network { protected: uint32_t N; // number nodes - uint32_t M; // number samples IData input_data; ISettings settings; + Trie V_G; - NumericMatrix counts; - NumericMatrix current_counts; - std::vector<GraphChange*> V_G; + std::vector<NumericMatrix*> counts; + std::vector<NumericMatrix*> current_counts; + std::vector<LogicalMatrix*> G; + + //std::vector<GraphChange*> curr_V_G; + + /** + * sort individual elements from lowest to largest by unordered_set + * this will be used to push to TRIE -> count subgraphs + */ + std::vector<key_t> ord_V_G; + public: Network(IData input, ISettings settings) @@ -161,30 +401,12 @@ public: this->settings = settings; this->N = Rcpp::sum(*input.sizes); - NumericMatrix firstGE = ((*input.GE)[0]); - - this->M = firstGE.nrow(); - - //this->ADJ = new Bitset2D(SK.ncol(),SK.nrow()); - - Rcout << "Network: N=" << N << ", M=" << M << "\n"; + this->initializeLists(); } ~Network() { - if(V_G.size() == 0) - { - return; - } - - GraphChange* curr = V_G[V_G.size() - 1]; - while(curr != nullptr) - { - GraphChange* prev = curr; - curr = curr->prev; - delete prev; - } - V_G.clear(); + this->deleteLists(); } void partialSK_SS() @@ -192,38 +414,44 @@ public: for(uint32_t i = 0; i < this->settings.I; i++) { Rcout << "partialSK-SS, Iteration: " << i << "\n"; - this->partialSK_SS_phase1_iter(); - - - + this->emptyCurrCountsAndG(); + this->partialSK_SS_iter(); } } + void SK_SS() + { + this->partialSK_SS(); + + for(uint32_t i = 0; i < this->settings.I; i++) + { + Rcout << "SK-SS, Iteration: " << i << "\n"; + this->emptyCurrCountsAndG(); + this->SK_SS_phase2_iter(); + } + } + + protected: - GraphChange* createGraphChange(GraphChange* prev, ChangeType action, uint32_t from, uint32_t to) + void SK_SS_phase2_iter() { - GraphChange* ret = new GraphChange; - ret->prev = prev; - ret->type = action; - ret->next = nullptr; - ret->from = from; - ret->to = to; - - return ret; + //TODO } - void partialSK_SS_phase1_iter() + + void partialSK_SS_iter() { - List& G = this->emptyG_0(); - + //List& G = this->emptyG_0(); // TODO remove? No, but used matrices from curr_V_G! + /* NumericVector twoScores = (*this->input_data.score_full_func)( G, *this->input_data.PK, this->settings.beta_L, this->settings.beta_H, *this->input_data.bge_param - ); + );*/ //TODO + NumericVector twoScores = NumericVector(2); //Rcout << twoScores << "\n"; @@ -243,43 +471,45 @@ protected: // Random permutation of the indices uint32_t num = sk.ncol() * sk.nrow(); IntegerVector xy = Rcpp::sample(num, num); - + + xy = Rcpp::Range(0, num-1); // TODO remove + Rcout << xy << "\n"; // It is necessary to do 2d loop in COL, ROW order, since // rows are local in memory by Rcpp standard + // UPD: after random permutation this doesn't matter for(uint32_t j = 0; j < (uint32_t)sk.ncol(); j++) { - //Rcout << "\rSituation: " << i << ", " << j << "\n"; for(uint32_t k = 0; k < (uint32_t)sk.nrow(); k++) { - // extract 2D coords from 1D sample - uint32_t lin = xy[j * sk.nrow() + k] - 1; + + // extract 2D coords from 1D random permutation + uint32_t lin = xy[j * sk.nrow() + k]; uint32_t r = lin % sk.nrow(); uint32_t c = lin / sk.nrow(); - if(sk(r,c) == 0) // <<< this chunk, but written without if - continue; - - - this->tryPerformMove(G, &twoScores, i, r, c, ChangeType::ADD); - //"\014" << Rcout << - i << "," << j << "," << k << " /" << + i << "," << r << "," << c << " /" << (SK_list->length()) << "," << (uint32_t)sk.ncol() << "," << (uint32_t)sk.nrow() << - "\n"; + "\n"; + + + if(sk(r,c) == 0) + continue; + + this->tryPerformMove(&twoScores, i, r, c, ChangeType::ADD); } } } - return; - + // for loop over 1...N // for loop over neighbors // try remove/add edge for(uint32_t i = 0; i < SK_list->length(); i++) { - const IntegerMatrix& gz = (*SK_list)[i]; + LogicalMatrix& gz = *(this->G[i]); // Random permutation of the indices uint32_t num = gz.ncol() * gz.nrow(); @@ -288,30 +518,27 @@ protected: // It is necessary to do 2d loop in COL, ROW order, since // rows are local in memory by Rcpp standard + // UPD: after random permutation this doesn't matter for(uint32_t j = 0; j < (uint32_t)gz.ncol(); j++) { - //Rcout << "\rSituation: " << i << ", " << j << "\n"; for(uint32_t k = 0; k < (uint32_t)gz.nrow(); k++) { - // extract 2D coords from 1D sample + // extract 2D coords from 1D random permutation uint32_t lin = xy[j * gz.nrow() + k] - 1; uint32_t r = lin % gz.nrow(); uint32_t c = lin / gz.nrow(); - - if(gz(r,c) == 0) // <<< this chunk, but written without if - this->tryPerformMove(G, &twoScores, i, r, c, ChangeType::ADD); - else - this->tryPerformMove(G, &twoScores, i, r, c, ChangeType::DELETE); + if(gz(r,c) == 0) // if not in G -> try add + this->tryPerformMove(&twoScores, i, r, c, ChangeType::ADD); + else // if in G -> try delete + this->tryPerformMove(&twoScores, i, r, c, ChangeType::DELETE); } } } - - delete &G; } - List& emptyG_0() + List& G_to_list() { const List* SK_list = this->input_data.SK; List* ret = new List(SK_list->length()); @@ -319,17 +546,81 @@ protected: for(uint32_t i = 0; i < SK_list->length(); i++) { const IntegerMatrix& sk = (*SK_list)[i]; - (*ret)[i] = LogicalMatrix(sk.nrow(), sk.ncol()); - - Rcpp::rownames((*ret)[i]) = Rcpp::rownames(sk); - Rcpp::colnames((*ret)[i]) = Rcpp::colnames(sk); + (*ret)[i] = *this->G[i]; } return *ret; } + void emptyCurrCountsAndG() + { + const List* SK_list = this->input_data.SK; + + for(uint32_t i = 0; i < SK_list->length(); i++) + { + const IntegerMatrix& sk = (*SK_list)[i]; + + //delete this->G[i]; + *(this->G)[i] = LogicalMatrix(sk.nrow(), sk.ncol()); + Rcpp::rownames(*(this->G)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->G)[i]) = Rcpp::colnames(sk); + + //delete this->current_counts[i]; + *(this->current_counts)[i] = NumericMatrix(sk.nrow(), sk.ncol()); + Rcpp::rownames(*(this->current_counts)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->current_counts)[i]) = Rcpp::colnames(sk); + } + } + + /** + * We should convert Rcpp lists to vectors of matrices because: + * > we can't easily manipulate Rcpp List objects + * > with matrix pointers it is easier to manage memory manually + * > BUT! we should still use matrices to call R score functions + */ + void initializeLists() + { + const List* SK_list = this->input_data.SK; + + for(uint32_t i = 0; i < SK_list->length(); i++) + { + const IntegerMatrix& sk = (*SK_list)[i]; + + (this->G).push_back(new LogicalMatrix(sk.nrow(), sk.ncol())); + Rcpp::rownames(*(this->G)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->G)[i]) = Rcpp::colnames(sk); + + (this->counts).push_back(new NumericMatrix(sk.nrow(), sk.ncol())); + Rcpp::rownames(*(this->counts)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->counts)[i]) = Rcpp::colnames(sk); + + (this->current_counts).push_back(new NumericMatrix(sk.nrow(), sk.ncol())); + Rcpp::rownames(*(this->current_counts)[i]) = Rcpp::rownames(sk); + Rcpp::colnames(*(this->current_counts)[i]) = Rcpp::colnames(sk); + } + + } + + /** + * Clear matrices in vectors + */ + void deleteLists() + { + const List* SK_list = this->input_data.SK; + for(uint32_t i = 0; i < SK_list->length(); i++) + { + delete this->G[i]; + delete this->counts[i]; + delete this->current_counts[i]; + } + + this->G.clear(); + this->counts.clear(); + this->current_counts.clear(); + } - void tryPerformMove(List& adjs, NumericVector* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action) + + void tryPerformMove(NumericVector* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action) { Rcout << "Trying to "; if(action == ChangeType::ADD) @@ -339,13 +630,9 @@ protected: else{ Rcout << "REMOVE"; } - Rcout << " edge: " << listID << ", " << from << ", " << to << "\n"; - - - - + List& adjs = this->G_to_list(); NumericVector scoreChanges = (*this->input_data.score_change_func) ( adjs, @@ -360,24 +647,33 @@ protected: to, action == ChangeType::ADD); + delete &adjs; /* - * Here I test 3 approaches to find the difference, because: - * We are dealing with 1e-20 and more precision, thus - * straightforward computation may overflow/underflow + * Here I tested multiple approaches to find the difference, because: + * We are dealing with ~ -1e+08 or even more, thus + * straightforward computation may overflow/underflow(when exp() is taken) */ - double per_part_diff = (scoreChanges[0] - (*score)[0]) + (scoreChanges[1] - (*score)[1]); + //double per_part_diff = (scoreChanges[0] - (*score)[0]) + (scoreChanges[1] - (*score)[1]); - double log_score_new = scoreChanges[0] + scoreChanges[1]; - double log_score_old = (*score)[0] + (*score)[1]; - double diff = log_score_new - log_score_old; + // double log_score_new = scoreChanges[0] + scoreChanges[1]; + // double log_score_old = (*score)[0] + (*score)[1]; + // double diff = log_score_new - log_score_old; double mod_diff = (scoreChanges[3] + scoreChanges[4]); Rcout << mod_diff << "\n"; // Modified(explicit per-partes) is used - diff = exp(mod_diff); + /** + * Modified(explicit per-partes) is used, e.g.: + * + * since we are working in small precisions + log scale: + * > difference is calculated in SCORE functions + * > difference is separated for BGe / Energy part + * > ONLY difference is converted by exp, since it is tractable! + */ + double diff = exp(mod_diff); NumericVector probs = {1.0, diff}; probs = probs / Rcpp::sum(probs); @@ -389,8 +685,10 @@ protected: *score = scoreChanges; - this->makeMove(adjs, listID, from, to, action); + this->makeMove(listID, from, to, action); + + // TODO remove debug if(action == ChangeType::ADD) { Rcout << "Adding"; @@ -398,13 +696,13 @@ protected: else{ Rcout << "Removing"; } - Rcout << " edge: " << listID << ", " << from << ", " << to << "\n"; } - void makeMove(List& adjs, uint32_t listID, uint32_t from, uint32_t to, ChangeType action) + void makeMove(uint32_t listID, uint32_t from, uint32_t to, ChangeType action) { + /* GraphChange* toAdd = new GraphChange; toAdd->from = from; toAdd->to = to; @@ -412,8 +710,27 @@ protected: toAdd->type = action; toAdd->is_graph = true; - toAdd->prev = this->V_G.empty() ? nullptr : this->V_G[this->V_G.size() - 1]; + toAdd->prev = this->curr_V_G.empty() ? nullptr : this->curr_V_G[this->curr_V_G.size() - 1]; toAdd->forward(adjs); + */ + + + // update adjacency + (*(this->G[listID]))(from,to) = action == ChangeType::ADD; + + + // Update our ordered add-only word + key_t kt = std::make_tuple(listID, from, to); + if(action == ChangeType::ADD) + { + insert_sorted(this->ord_V_G, kt); + //this->ord_V_G.insert(kt); + } + else{ + erase_sorted(this->ord_V_G, kt); + //this->ord_V_G.erase(this->ord_V_G.find(kt)); + } + this->V_G.appendChainAddOnly(this->ord_V_G); } @@ -421,18 +738,9 @@ protected: }; /** - * These two functions are the real implementation in C++... + * This function is the interface to the R part, the only function exported */ -// IntegerMatrix c_partialSK_SS( -// const List& GE, -// const List& PK, -// const List& SK, -// const NumericVector& sizes, -// uint32_t I, -// bool debug, -// double alpha); IntegerMatrix c_SK_SS( - const List& GE, const List& PK, const List& SK, const NumericVector& sizes, @@ -450,35 +758,8 @@ IntegerMatrix c_SK_SS( */ //#include "skeleton.h" - -// [[Rcpp::export]] -double ttt(const IntegerMatrix& SK) -{ - Rcout << SK.nrow() << "," << SK.ncol() << "\n"; - return 0.0; -} - - -IntegerMatrix c_partialSK_SS( - const List& GE, - const List& PK, - const List& SK, - const NumericVector& sizes, - uint32_t I, - bool debug, - double alpha) -{ - Rcout << "I:" << I << "debug:" << debug << "\n"; - Rcout << "I am in partialSK_SS function!\n"; - //Network* net = new Network(GE, SK); - //delete net; - - return IntegerMatrix(); -} - // [[Rcpp::export]] IntegerMatrix c_SK_SS( - const List& GE, const List& PK, const List& SK, const NumericVector& sizes, @@ -495,7 +776,6 @@ IntegerMatrix c_SK_SS( IData i_data; - i_data.GE = &GE; i_data.PK = &PK; i_data.SK = &SK; i_data.sizes = &sizes; @@ -513,7 +793,7 @@ IntegerMatrix c_SK_SS( Network* net = new Network(i_data, settings); - net->partialSK_SS(); + //net->partialSK_SS(); delete net;