modifications

MEMERIS modifications in the MEME 3.5.1. source code

meme.c
   - declaration a new function
           static void readSecondaryStructurePrior(
              DATASET *dataset,           /* the dataset */
              MODEL *model,              /* the model */
              char *secondaryStructureFilename,   /* input filename */
              double secondaryStructurePseudocount /* pseudocount for prior prob */
           );
   - define this function
           static void readSecondaryStructurePrior(
           ...
   - call readSecondaryStructurePrior() function if -secstruct is set
           if (dataset->secondaryStructureFilename != NULL) {
               printf("use secondary structure information from: %s\n", dataset->secondaryStructureFilename); fflush(stdout);
              readSecondaryStructurePrior(dataset, model, dataset->secondaryStructureFilename, dataset->secondaryStructurePseudocount);
              printf("done\n"); fflush(stdout);
           }
   - output the maximum adjustment of the pseudocount in case of TCM
      if ((dataset->secondaryStructureFilename != NULL) && (model->mtype = Tcm)) {
          printf("max adjustment of pseudocount: %f\n",MAXADJUST);
       }

meme.h
   - new global variables
          DEXTERN(double, MAXADJUST, 0);       /* maximum adjustment of the pseudocount -- for statistics */
          DEXTERN(int, MAX_NEWTON_ITERATIONS, 30);       /* maximum number of iterations for Newton-Raphson Method */
   - introduce new parameters for the secstruct filename and the pseudocount for dataset
          char *secondaryStructureFilename; /* name of the file with the secondary structure information */
          double secondaryStructurePseudocount; /* pseudocount added to each position to avoid a prior prob of 0 */
   - introduce variables that store the EF/PU values, the sigma values and LOG/INTLOG precomputations
          double *ss_value;   /* secondary structure value (EF or PU) = measure for single-strandedness of motif */
          double *sigma;       /* prior prob for Zij */
          double *log_sigma; /* LOG(sigma) */
          int *intlog_sigma; /* INTLOG(sigma) */
          double sum_ss_value;   /* \sum ss_value[i] */
          double max_ss_value;    /* max ss_value[i] (used for TCM) */



init.c
   - introduce two new static variables
           static char *secondaryStructureFilename = NULL;       /* if not NULL, the secondary structure information from this filename is used for the prior probability */
           static double secondaryStructurePseudocount = 0.1; /* pseudocount added to each position to avoid a prior prob of 0 */
   - read the two parameters -secstruct and -pi in init_meme()
            DATA_OPTN(1, secstruct, <filename>, \tuse secondary structure information from <filename> for the a priori probability, secondaryStructureFilename = _OPTION_);
        DATA_OPTN(1, pi, <double>, \t\tpseudocount added to each secondary structure prior (default 0.1), secondaryStructurePseudocount = atof(_OPTION_));
   - check if a motif length is given, check for DNA alphabet and check if -revcomp and -secstruct is set (if so disallow -revcomp)
          if (secondaryStructureFilename != NULL) {
            if (invcomp) {
              invcomp = FALSE;
               fprintf(stderr, "ERROR: you set 'revcomp' and use secondary structure information: revcomp setting is not used\n");
            }
           if (w == 0) {
               fprintf(stderr, "ERROR: you must set the motif length (-w) to the value used for the computation of the secondary structure values\n");
               exit(1);
           }
           if (strcmp(alph, "DNA") != 0) {
               alph = "DNA";
               fprintf(stderr, "ERROR: you set 'protein' and use secondary structure information: switch to DNA alphabet\n");
           }
          }
   - set model->w if only one w is allowed
          if (model->max_w == model->min_w)
            model->w = model->max_w;
   - store the filename in dataset->secondaryStructureFilename and the pseudocount dataset->secondaryStructurePseudocount
          dataset->secondaryStructureFilename = secondaryStructureFilename;
          dataset->secondaryStructurePseudocount = secondaryStructurePseudocount;

oops.c
   - declaration of functions for TCM M-step
          double NewtonRaphsonMethod(
           MODEL *model,           /* the model */
       DATASET *dataset       /* the dataset */
          );
          double computeFirstDerivative(
             MODEL *model,           /* the model */
       DATASET *dataset,       /* the dataset */
             double lambda           /* current lambda */
          );
       double computeSecondDerivative(
       MODEL *model,           /* the model */
             DATASET *dataset,       /* the dataset */
       double lambda         /* current lambda */
          );
   - use Newton Raphson Method to maximize lambda    in m_step()
           /* finding the lambda for the next EM iteration is different for TCM model if secondary structure information is given */
           if ((dataset->secondaryStructureFilename != NULL) && (model->mtype == Tcm)) {
                double best_lambda = NewtonRaphsonMethod(model,dataset);       /* use Newton Raphson Method to find the best lambda */
                model->nsites_obs = q;
              model->lambda_obs = MIN(1.0, best_lambda);
              model->nsites = q*(1-wnsites) + model->psites*wnsites;
                /* lambda = (q(1-wnsites) + psites*wnsites) / n*m
               --> since lambda_obs = q/(n*m) it follows
               lambda = lambda_obs - lambda_obs*wnsites + (psites*wnsites/(n*m))
                */
              model->lambda = MIN(1.0, (model->lambda_obs - model->lambda_obs*wnsites + model->psites*wnsites/wps(dataset, w)) );
            }else{
   - define the functions NewtonRaphsonMethod(), computeFirstDerivative(), computeSecondDerivative()
   - use log sigma_ij instead of log 1/m if secondary structure information is given in e_step() for oops and
      use gamma * sigma_ij instead of gamma/m in e_step() for zoops
            if (dataset->secondaryStructureFilename != NULL) {
               if (mtype==Oops) {
                 log_gamma = s->log_sigma[j];
               }else{
                 log_lambda = LOG(gamma * s->sigma[j]);
               }
           }



tcm.c
   - compute the position specific lambda values from the secondary structure information and check if the pseudocount has to be adjusted (tcm_e_step())
           /* use log sigma_ij * lambda * m instead of lambda if secondary structure information is given */
           /* NOTE: log_sigma is the prior for + or - strand --> here only + strand --> log_sigma = 0 */
           if (dataset->secondaryStructureFilename != NULL) {

          /* first check if the maximum sigma * (lambda*m) > 1   --> if so P(Zij=1 | \phi) can be > 1 */
             double Pcount = dataset->secondaryStructurePseudocount;
           double maxPrior = ((s->max_ss_value + Pcount) / (s->sum_ss_value + (m * Pcount))) * (lambda*m);

              if (maxPrior > 1.0 ) {
               /* compute new pseudocount that gives sigma_i_max = 1 = (max_ss_value + pseudocount) / (\sum (ss_value[i] + pseudocount)) * lambda * m */
               Pcount = (-1 * s->max_ss_value * lambda * m + s->sum_ss_value) / (m * (lambda - 1));

               /* for statistics keep maximum adjustment */
               if (Pcount - dataset->secondaryStructurePseudocount > MAXADJUST) {
                   MAXADJUST = Pcount - dataset->secondaryStructurePseudocount;
               }
           }

           /* compute new sigmas with this pseudocount */
           double sum = s->sum_ss_value + (m * Pcount);         /* \sum ss_value[i] + m*pseudocount */
       for (j=0; j < m; j++) {
          s->sigma[j] = (s->ss_value[j] + Pcount) / sum;
           }
          }
   - use the position specific lambda values
           /* use the prior instead of lambda */
          if (dataset->secondaryStructureFilename != NULL) {
              double p = MIN(1.0, ( s->sigma[j] * lambda * m ) ) ;       /* rounding */
              log_lambda = LOG(p);
              log_1mlambda = LOG(1-p);
           }

subseq7.c
   - in case of two equally good starting points choose the one with higher single-strandedness for the substring in function score_llr_pop()
       /* in case of two equally good starting points choose the one with higher single-strandedness for the substring (if we have secondary structure information) */
    }else if (-log_pop == s_points[i_nsites0].score) {
         if (dataset->secondaryStructureFilename != NULL) {
              double sigma_old = samples[ s_points[i_nsites0].iseq ]->sigma[ s_points[i_nsites0].ioff ];
              double sigma_new = samples[ iseq ]->sigma[ ioff ];
              if (sigma_new > sigma_old) {
                   s_points[i_nsites0].iseq = iseq;
                 s_points[i_nsites0].ioff = ioff;
                  s_points[i_nsites0].e_cons0 = eseq;
               s_points[i_nsites0].wgt_nsites = wN;
                  s_points[i_nsites0].score = -log_pop;
               }
           }
   - to use secondary structures only during the start point search, add a bool parameter to global_max() and local_max()
          , BOOLEAN useSecStruct
      and call global_max and local_max with this flag
           /* during the start point search sort==TRUE; during the discretize model step sort==FALSE
           since we want to include secondary structure information only during the start point search, use sort as a flag and pass it to global_max */
           n_maxima = global_max(dataset, w, maxima, ic, sort);
         } else {
           /* during the start point search sort==TRUE; during the discretize model step sort==FALSE
            since we want to include secondary structure information only during the start point search, use sort as a flag and pass it to local_max */
           n_maxima = local_max(dataset, w, maxima, ic, sort);


   - use pY[j]*sigma[j] instead of pY[j] alone to determine the maximum in global_max() for ZOOPS and OOPS in the start point search (pY is the value for the current position of Theta0)
          max += INT_LOG(0);            /* to account for a sigma[j] = 0 */
          int log_sigma = 0;
             if ((dataset->secondaryStructureFilename != NULL) && (useSecStruct)) {
               log_sigma = s->intlog_sigma[j];
           }
             if (pY[j] + log_not_o[j] + log_sigma > max) {       /* new maximum found */
             /*if (pY[j] + log_not_o[j] > max) {       /* new maximum found */
                   max = pY[j] + log_not_o[j] + log_sigma;        /* pY * Pr(no overlap) */
                   /*max = pY[j] + log_not_o[j];        /* pY * Pr(no overlap) */
   - use pY[j]*sigma[j] instead of pY[j] alone to determine the maximum in local_max() for TCM start point search
          int log_sigma = 0;
            if ((dataset->secondaryStructureFilename != NULL) && (useSecStruct)) {
                 log_sigma = s->intlog_sigma[0];
                   max += log_sigma;
           }
            if ((dataset->secondaryStructureFilename != NULL) && (useSecStruct)) {
              log_sigma = s->intlog_sigma[j];
             prob += log_sigma;
          }

display.c
   - output EF/PU values (if given) for the motif hits and the average EF/PU in align_sites()
          double aveSS = 0;
          int numHits = 0;
          /* added by M.H. */
          /* remove \n and add a > 0 ? statement */
          fprintf(outfile, "%6s %9s %10s %*sSite%*s", "Start", "P-value", "", ( (w/2-2) >0 ? (w/2-2) : 0), "", ((w - w/2 - 4) > 0 ? (w - w/2 - 4) : 0), "");
          /* fprintf(outfile, "%6s %9s %10s %*sSite%*s\n", "Start", "P-value", "", w/2 - 2, "", w - w/2 - 4, "");*/
          /* added by M.H. */
          /* output 'SecStruc' */
          if (dataset->secondaryStructureFilename != NULL) {
              fprintf(outfile, "%*s %10s %8s\n", MAX(0, w - ( (w-w/2-4) > 0 ? (w-w/2-4) : 0 ) - ( (w/2-2) >0 ? (w/2-2) : 0) - 4 ) , "","","SecStruc");
          }else{
              fprintf(outfile, "\n");
          }
          if (dataset->secondaryStructureFilename != NULL) {
              fprintf(outfile, " %10s %8s", "", "--------");
          }
          /* remove \n */
         /* print the alignment */
         if (pre[0] == '\0') {           /* print a dot in empty pre */
        fprintf(outfile, " %10s %-*s %-10s", ".", w, site, post);
         } else {
        fprintf(outfile, " %10s %-*s %-10s", pre, w, site, post);
         }
          /* print EF/PU value and compute average */
         if (dataset->secondaryStructureFilename != NULL) {
             fprintf(outfile, " %f\n",s->ss_value[y]);
              aveSS += s->ss_value[y];
               numHits ++;
         }else{
              fprintf(outfile, "\n");
         }
          /* print average */
          if (dataset->secondaryStructureFilename != NULL) {
               aveSS /= numHits;
              fprintf(outfile, "average single-strandedness: %f\n",aveSS);
          }