All is ready except for final scoring -> Overflow of BGe score

f6f41d01 · Anuarbekov, Alikhan · f861ec0e · f6f41d01 · f6f41d01
Commit f6f41d01 authored 1 year ago by Anuarbekov, Alikhan
--- a/source/skeleton.R
+++ b/source/skeleton.R
@@ -200,15 +200,17 @@ P_D_G <- function(G_adjs, param_in)
    pDG1 <- pDG1 + BGe_compute(adj, param_in)
  }
  
-  nodes_cand <- rownames(G_adjs[[1]])
-  pDG2 <- sum(sapply(nodes_cand, function(node)
-  {
-    score <- BGe_node_compute(node, integer(0), param_in)
-    return(score)
-  })
-  )
-  
-  return(pDG1 + pDG2)
+  # They are still the same!
+  # nodes_cand <- rownames(G_adjs[[1]])
+  # pDG2 <- sum(sapply(nodes_cand, function(node)
+  # {
+  #   score <- BGe_node_compute(node, integer(0), param_in)
+  #   return(score)
+  # })
+  # )
+  # + pDG2
+  
+  return(pDG1)
 }

 #'
@@ -378,7 +380,7 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be
  # this is a large computation that computes a large parameter matrix:
  # N x N, where N is the sum of sizes, e.g., total number of genes
  cat("Computing large BGe matrix...\n")
-  if(FALSE)
+  if(TRUE)
  {
    myScore <- scoreparameters_BiDAG_BGe(n = ncol(GE), data = GE)
    param_in <- list(n=myScore$n, awpN=myScore$awpN, TN=myScore$TN, scoreconstvec=myScore$scoreconstvec)
@@ -394,10 +396,18 @@ SK_SS_tripartite <- function(GE, PK, SK, parts, I=1, debug=FALSE, alpha=0.05, be

  set.seed(0)
  
+  start.time <- Sys.time()
+  print(start.time)
+  
  E_f_G_out <- c_SK_SS(PK_in,SK_in, sizes, param_in,
                      I, debug, beta_L, beta_H,
                      score_G, score_change_G)
  
+  
+  end.time <- Sys.time()
+  print(end.time)
+  print(end.time - start.time)
+  
  print(E_f_G_out)
  
  E_f_G_out <- lapply(E_f_G_out, function(m) m > alpha)

--- a/source/skeleton.cpp
+++ b/source/skeleton.cpp
@@ -8,6 +8,7 @@
 #include <string>
 #include <vector>
 #include <algorithm>
+#include <map>

 // Pure C99
 #include <cstring>
@@ -67,9 +68,16 @@ struct key_hash : public std::unary_function<key_t, std::size_t>
 {
  std::size_t operator()(const key_t& k) const
  {
-    return (std::size_t)(std::get<0>(k)) ^ 
-           (std::size_t)(std::get<1>(k)) ^ 
-           (std::size_t)(std::get<2>(k));
+    std::size_t ret = 0;
+    ret ^= std::hash<uint32_t>()(std::get<0>(k)) + 0x9e3779b9 + (ret<<6) + (ret>>2);
+    ret ^= std::hash<uint32_t>()(std::get<1>(k)) + 0x9e3779b9 + (ret<<6) + (ret>>2);
+    ret ^= std::hash<uint32_t>()(std::get<2>(k)) + 0x9e3779b9 + (ret<<6) + (ret>>2);
+    
+    return ret;
+    
+    // return (std::size_t)(std::get<0>(k)) ^ 
+    //        (std::size_t)(std::get<1>(k)) ^ 
+    //        (std::size_t)(std::get<2>(k));
  }
 };

@@ -118,6 +126,14 @@ struct GraphChange
   */
  GraphChange* prev = nullptr;
  
+  
+  /**
+   * Note: since unordered_map does not have any guarantee about the key order,
+   *    the random sampling is modified by the changing key ordering!
+   *    
+   * If such behavior is unacceptable, consider using std::map, but this
+   *   will cost a change of complexity from O(1) to O(log n)
+   */
  std::unordered_map<
    const key_t, 
    GraphChange*,
@@ -157,7 +173,7 @@ struct GraphChange
   *        ! Note -> nodes with * haven't been visited !
   *            
   *  > if no flag is used to determine the node in Trie, we can't mark the above
-   *  > when pushing a word, only its last tail will be marked
+   *  > when pushing a word, only its last tail node will be marked
   */
  bool is_graph = false;
  
@@ -214,6 +230,11 @@ struct GraphChange
    
    return prev;
  }
+  
+  void printG()
+  {
+    Rcout << listID << "," << from << "," << to << "; " << is_graph << "\n";
+  }
 };

 struct graph_comp
@@ -238,7 +259,7 @@ public:
  
  ~Trie()
  {
-    
+    // TODO free, but not really needed for single script :)
  }
  
  
@@ -250,6 +271,12 @@ public:
  {
    if(word.size() == 0) return false;
    
+    /*
+    Rcout << "adding a word:\n";
+    for(key_t k : word)
+      Rcout << "["<< (uint32_t)std::get<0>(k) << "," << (uint32_t)std::get<1>(k) << "," << (uint32_t)std::get<2>(k) << "]";
+    Rcout<<"\n";
+    */

    // Now it is easier to add a word to trie:
    // since characters are sorted, just traverse and add individual nodes
@@ -257,6 +284,7 @@ public:
    if(this->root.find(word[0]) == this->root.end())
    {
      current = new GraphChange(word[0], nullptr);
+      root[word[0]] = current;
    }
    else{
      current = root.at(word[0]);
@@ -299,25 +327,55 @@ public:
    }
  }
  
-  
+  /**
+   * Randomly chooses a graph from the Trie and sets the G(adjacency) graph
+   * to it
+   */
  void sampleGraph(std::vector<LogicalMatrix*>& G_adjs)
  {
    uint32_t currIndex = 0;
    
    // TODO ideally the Rcpp::sample should be modified
    // to handle larger numbers
-    uint32_t sampledIndex = Rcpp::sample(INT_MAX, 1)[0]; 
+    uint32_t sampledIndex = Rcpp::sample(this->total_num_graphs, 1)[0] - 1; 
    
-    // TODO  
+    this->find(&currIndex, sampledIndex, G_adjs);
  }
  
-  // TODO maybe 32 bit won't be enough
+  // 
+  /**
+   * Returns a number of subgraphs in Trie. 
+   * 
+   * Note: this isn't the same as the number of nodes. We count only the subgraphs,
+   *       for more detail see the struct GraphChange
+   * 
+   * Note 2: maybe this 32-bit number won't be enough, consider using 64bit number,
+   *   but then, again, Rcpp::sample should be modified to 64-bit (2x sample + shift etc...)!
+   *   e.g., .Machine$integer.max on tested machine:= 2147483647
+   */
  uint32_t total_num()
  {
    return this->total_num_graphs;
  }
  
+  
+  void print()
+  {
+    
+    for(const auto & [ key, value ] : this->root)
+    {
+        this->printRecursive(value);
+    }
+  }
+  
 protected:
+  /**
+   * Note: since unordered_map does not have any guarantee about the key order,
+   *    the random sampling is modified by the changing key ordering!
+   *    
+   * If such behavior is unacceptable, consider using std::map, but this
+   *   will cost a change of complexity from O(1) to O(log n)
+   */
  std::unordered_map<
    const key_t, 
    GraphChange*,
@@ -325,22 +383,68 @@ protected:
  > 
  root;
  
-  // TODO assert total_num_graphs < INT_MAX
-  uint64_t total_num_graphs = 0;
+  
+  void printRecursive(GraphChange* curr)
+  {
+    Rcout << "recursive:\n";
+    curr->printG();
+    Rcout << "going down:\n";
+    for(const auto & [ key, value ] : curr->next_map)
+    {
+      this->printRecursive(value);
+    }
+    Rcout << "going up:\n";
+  }
+  
  
  
-  void addWordRecursive(std::vector<GraphChange*>& word, uint32_t index)
+  
+  
+  // TODO assert total_num_graphs < INT_MAX
+  uint32_t total_num_graphs = 0;
+  
+  void find(uint32_t* currIdx, const uint32_t& needed, std::vector<LogicalMatrix*>& G_adjs)
  {
-    key_t currKey = std::make_tuple(word[0]->listID, word[0]->from, word[0]->to);
-    if(root.find(currKey) == root.end())
+    // Note: if the sampled subgraphs are too deep and stack memory overflow,  
+    // then consider using the non-recursive version
+    for(const auto & [ key, value ] : this->root)
    {
-        // TODO
+      // Go down
+      value->forward(G_adjs);
+      *currIdx += value->is_graph;
+      
+      
+      this->recursiveFind(value, currIdx, needed, G_adjs);
+      if(*currIdx == needed)
+      {
+        break;
+      }
+      
+      value->backward(G_adjs);
    }
  }
  
-  void findIndex(uint64_t* currIdx, const uint64_t& needed)
+  
+  void recursiveFind(GraphChange* curr, uint32_t* currIdx, const uint32_t& needed, std::vector<LogicalMatrix*>& G_adjs)
  {
-    // TODO
+    if(*currIdx == needed)
+    {
+      return;
+    }
+    
+    curr->forward(G_adjs);
+    currIdx += curr->is_graph;
+    
+    for(const auto & [ key, value ] : curr->next_map)
+    {
+        this->recursiveFind(value, currIdx, needed, G_adjs);
+        if(*currIdx == needed)
+        {
+          return;
+        }
+    }
+    
+    curr->backward(G_adjs);
    return;
  }
  
@@ -354,11 +458,34 @@ protected:
 // [[Rcpp::export]]
 double tst()
 {
-  IntegerVector rnd = Rcpp::sample(6, 6);
-  NumericMatrix t = NumericMatrix(2, 3, rnd.begin());
-  Rcout << t << "\n";
+  Trie trie;
+  trie.print();
+  std::vector<key_t> word;
+
+  
+  word.push_back(std::make_tuple(0, 5, 10));
+  trie.appendChainAddOnly(word);
+  trie.print();
+  
+  word.push_back(std::make_tuple(0, 7, 10));
+  trie.appendChainAddOnly(word);
+  trie.print();
+  
+  word.push_back(std::make_tuple(0, 5, 11));
+  trie.appendChainAddOnly(word);
+  trie.print();
+  
+  word.push_back(std::make_tuple(0, 4, 10));
+  trie.appendChainAddOnly(word);
+  trie.print();
+  
+  word.erase(word.begin() + 0);
+  trie.appendChainAddOnly(word);
+  trie.print();
+  
+  Rcout << "Final trie size: " << trie.total_num() << "\n";
+  
  
-  Rcout << t * 2.5 << "\n";
  return 0.0;
 }

@@ -414,6 +541,16 @@ protected:
  std::vector<NumericMatrix*> E_f_G;
  std::vector<NumericMatrix*> current_counts;
  double currentProb = 1.0; // TODO
+  NumericVector initialProbs;
+  
+  
+  double bge_max = -99999999999.0;
+  double bge_min = 9999999999.0;
+  double energy_max = -99999999999.0;
+  double energy_min = 9999999999.0;
+  
+  double prob_max = -99999999999.0;
+  double prob_min = 9999999999.0;
  
 public:  
  Network(IData input, ISettings settings)
@@ -443,6 +580,7 @@ public:
  void SK_SS()
  {
    this->partialSK_SS();
+    return; // TODO remove
    
    for(uint32_t i = 0; i < this->settings.I; i++)
    {
@@ -454,12 +592,17 @@ public:
  
  List& result()
  {
+    Rprintf("BGe min/max: %.10f, %.10f\n", this->bge_min, this->bge_max);
+    Rprintf("Energy min/max: %.10f, %.10f\n", this->energy_min, this->energy_max);
+    Rprintf("Prob min/max: %.10f, %.10f\n", this->prob_min, this->prob_max);
+    Rprintf("Initial probs: %.10f, %.10f\n", this->initialProbs[0], this->initialProbs[1]);
+    
    const List* SK_list = this->input_data.SK;
    List* ret = new List(SK_list->length());

    for(uint32_t i = 0; i < SK_list->length(); i++)
    {
-      *this->E_f_G[i] = (*this->E_f_G[i]) / (this->V_G.total_num() / 100.0);
+      *this->E_f_G[i] = (*this->E_f_G[i]); //  / (this->V_G.total_num() / 100.0)
      (*ret)[i] = *this->E_f_G[i];
    }
    
@@ -469,22 +612,59 @@ public:
 protected:  
  void SK_SS_phase2_iter()
  {
-    //TODO
+    this->V_G.sampleGraph(this->G); // sample adjacency matrix, e.g., sample G_0
+    NumericVector twoScores = this->callScoreFull();
+    
+    
+    const List* SK_list = this->input_data.SK;
+    
+    // for loop over 1...N
+    // for loop over neighbors
+    // try add edge
+    for(uint32_t i = 0; i < SK_list->length(); i++)
+    {
+      const IntegerMatrix& sk = (*SK_list)[i];
+      
+      // Random permutation of the indices
+      uint32_t num = sk.ncol() * sk.nrow();
+      IntegerVector xy = Rcpp::sample(num, num);
+      
+      // It is necessary to do 2d loop in COL, ROW order, since
+      // rows are local in memory by Rcpp standard
+      // UPD: after random permutation this doesn't matter
+      for(uint32_t j = 0; j < (uint32_t)sk.ncol(); j++)
+      {
+        for(uint32_t k = 0; k < (uint32_t)sk.nrow(); k++)
+        {
+          
+          // extract 2D coords from 1D random permutation
+          uint32_t lin = xy[j * sk.nrow() + k] - 1;
+          uint32_t r = lin % sk.nrow();
+          uint32_t c = lin / sk.nrow();
+          
+          Rcout << "\014" << 
+            i << "," << j << "," << k << " /" << 
+              (SK_list->length()) << "," << (uint32_t)sk.ncol() << "," << (uint32_t)sk.nrow() <<
+                "\n" << "Trie size: " << this->V_G.total_num(); 
+          
+          // if inside G_0 -> skip
+          if((*this->G[i])(r,c) != 0)      
+            continue;
+          
+          // try add an edge outside of G_0
+          this->tryPerformMove(&twoScores, i, r, c, ChangeType::ADD);
+        }
+      }
+    }
  }
  
  
  void partialSK_SS_iter()
  {
-    List& adjs = this->G_to_list();
-    NumericVector twoScores = (*this->input_data.score_full_func)(
-                       adjs, 
-                       *this->input_data.PK,
-                       this->settings.beta_L,
-                       this->settings.beta_H,
-                       *this->input_data.bge_param
-    );
-    delete &adjs;
-
+    NumericVector twoScores = this->callScoreFull();
+    this->initialProbs = twoScores;
+    
+    
    const List* SK_list = this->input_data.SK;

    // for loop over 1...N
@@ -511,12 +691,12 @@ protected:
            uint32_t r = lin % sk.nrow();
            uint32_t c = lin / sk.nrow();
            
-            
-            //"\014" << 
-            Rcout << lin << " = " <<
-              i << "," << r << "," << c << " /" << 
+            Rcout << "\014" << 
+              i << "," << j << "," << k << " /" << 
                (SK_list->length()) << "," << (uint32_t)sk.ncol() << "," << (uint32_t)sk.nrow() <<
-                  "\n"; 
+                  "\n" << "Trie size: " << this->V_G.total_num() << "\n" 
+                       << "BGe: max:" << this->bge_max << ", min:" << this->bge_min << "\n"
+                       << "prob: " << this->currentProb << "\n"; 
            

            if(sk(r,c) == 0)      
@@ -526,7 +706,9 @@ protected:
          }
       }
    }
-
+    
+    return; // TODO remove
+    
    // for loop over 1...N
    // for loop over neighbors
    // try remove/add edge
@@ -551,6 +733,13 @@ protected:
          uint32_t r = lin % gz.nrow();
          uint32_t c = lin / gz.nrow();
          
+          
+          Rcout << "\014" << 
+            i << "," << j << "," << k << " /" << 
+              (SK_list->length()) << "," << (uint32_t)gz.ncol() << "," << (uint32_t)gz.nrow() <<
+                "\n" << "Trie size: " << this->V_G.total_num(); 
+          
+          
          if(gz(r,c) == 0)  // if not in G -> try add
            this->tryPerformMove(&twoScores, i, r, c, ChangeType::ADD); 
          else              // if in G -> try delete
@@ -562,7 +751,7 @@ protected:
    // Normalize counts by 100 * iteration number
    for(uint32_t i = 0; i < SK_list->length(); i++)
    {
-      *this->current_counts[i] = (*this->current_counts[i]) / (this->settings.I * 100.0);
+      *this->current_counts[i] = (*this->current_counts[i]); //  / (this->settings.I * 100.0)
      *this->E_f_G[i] += *this->current_counts[i];
    }
  }
@@ -650,6 +839,7 @@ protected:

  void tryPerformMove(NumericVector* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action)
  {
+    /*
    Rcout << "Trying to ";
    if(action == ChangeType::ADD)
    {
@@ -659,24 +849,19 @@ protected:
      Rcout << "REMOVE";
    }
    Rcout << " edge: " << listID << ", " << from << ", " << to << "\n"; 
+    */
+    NumericVector scoreChanges = this->callScoreChange(score, listID, from, to, action);
    
-    List& adjs = this->G_to_list();
-    NumericVector scoreChanges = (*this->input_data.score_change_func)
-    (
-        adjs, 
-        *this->input_data.PK, 
-        this->settings.beta_L, 
-        this->settings.beta_H, 
-        *this->input_data.bge_param,
-        *this->input_data.sizes,
-        *score, 
-        listID,
-        from, 
-        to, 
-        action == ChangeType::ADD);
+    double bge = scoreChanges[1];
+    double energy = scoreChanges[0];
+    double prob = bge + energy;
    
-    delete &adjs;
-
+    if(bge > this->bge_max) this->bge_max = bge;
+    if(bge < this->bge_min) this->bge_min = bge;
+    if(energy > this->energy_max) this->energy_max = energy;
+    if(energy < this->energy_min) this->energy_min = energy;
+    if(prob > this->prob_max) this->prob_max = prob;
+    if(prob < this->prob_min) this->prob_min = prob;
    /*
     * Here I tested multiple approaches to find the difference, because:
     * We are dealing with ~  -1e+08 or even more, thus
@@ -686,7 +871,7 @@ protected:
     * in each scoring function. For reference see  <score_change(...)> R function
     */
    double mod_diff = (scoreChanges[3] + scoreChanges[4]);
-    Rcout << mod_diff << "\n";
+    // Rcout << mod_diff << "\n";
    
    /**
     * Modified(explicit per-partes) is used, e.g.:
@@ -711,7 +896,11 @@ protected:
    
    this->makeMove(listID, from, to, action);
    
+    // Is it appropriate?
+    this->currentProb *= diff;
+    
    // TODO remove debug
+    /*
    if(action == ChangeType::ADD)
    {
      Rcout << "Adding";
@@ -719,7 +908,7 @@ protected:
    else{
      Rcout << "Removing";
    }
-    Rcout << " edge: " << listID << ", " << from << ", " << to << "\n"; 
+    Rcout << " edge: " << listID << ", " << from << ", " << to << "\n"; */
  }
  
  
@@ -756,6 +945,44 @@ protected:
    }
  }
  
+  
+  NumericVector callScoreChange(NumericVector* score, uint32_t listID, uint32_t from, uint32_t to, ChangeType action)
+  {
+    List& adjs = this->G_to_list();
+    NumericVector scoreChanges = (*this->input_data.score_change_func)
+      (
+          adjs, 
+          *this->input_data.PK, 
+          this->settings.beta_L, 
+          this->settings.beta_H, 
+          *this->input_data.bge_param,
+          *this->input_data.sizes,
+          *score, 
+          listID,
+          from, 
+          to, 
+          action == ChangeType::ADD);
+    
+    delete &adjs;
+    return scoreChanges;
+  }
+  
+  NumericVector callScoreFull()
+  {
+    List& adjs = this->G_to_list();
+    NumericVector twoScores = (*this->input_data.score_full_func)(
+      adjs, 
+      *this->input_data.PK,
+      this->settings.beta_L,
+      this->settings.beta_H,
+      *this->input_data.bge_param
+    );
+    delete &adjs;  
+    return twoScores;
+  }
+  
+  
+  
 };

 /**
@@ -814,6 +1041,9 @@ List c_SK_SS(
  
  net->SK_SS();
  List ret = net->result();
+  
+  
+
  delete net;
  
  return ret;