MEMERIS modifications in the MEME 3.5.1. source code


meme.c
    - declaration a new function
            static void readSecondaryStructurePrior(
                DATASET *dataset,            /* the dataset */
              MODEL *model,                 /* the model */
              char *secondaryStructureFilename,    /* input filename */
              double secondaryStructurePseudocount  /* pseudocount for prior prob */
            );
    - define this function
            static void readSecondaryStructurePrior(
            ...
    - call readSecondaryStructurePrior() function if -secstruct is set
           if (dataset->secondaryStructureFilename != NULL) {
                printf("use secondary structure information from: %s\n", dataset->secondaryStructureFilename); fflush(stdout);
              readSecondaryStructurePrior(dataset, model, dataset->secondaryStructureFilename, dataset->secondaryStructurePseudocount);
                 printf("done\n"); fflush(stdout);
            }        
    - output the maximum adjustment of the pseudocount in case of TCM
          if ((dataset->secondaryStructureFilename != NULL) && (model->mtype = Tcm)) {
            printf("max adjustment of pseudocount: %f\n",MAXADJUST);
         }



meme.h
    - new global variables
          DEXTERN(double, MAXADJUST, 0);        /* maximum adjustment of the pseudocount  -- for statistics */
          DEXTERN(int, MAX_NEWTON_ITERATIONS, 30);        /* maximum number of iterations for Newton-Raphson Method */
    - introduce new parameters for the secstruct filename and the pseudocount for dataset
          char *secondaryStructureFilename;  /* name of the file with the secondary structure information */
          double secondaryStructurePseudocount; /* pseudocount added to each position to avoid a prior prob of 0 */
    - introduce variables that store the EF/PU values, the sigma values and LOG/INTLOG precomputations
          double *ss_value;    /* secondary structure value (EF or PU) = measure for single-strandedness of motif */
          double *sigma;        /* prior prob for Zij */
          double *log_sigma; /* LOG(sigma) */
          int *intlog_sigma; /* INTLOG(sigma) */
          double sum_ss_value;    /* \sum ss_value[i] */
          double max_ss_value;     /* max ss_value[i] (used for TCM) */
    
    
    
init.c
    - introduce two new static variables
            static char *secondaryStructureFilename = NULL;        /* if not NULL, the secondary structure information from this filename is used for the prior probability */
            static double secondaryStructurePseudocount = 0.1; /* pseudocount added to each position to avoid a prior prob of 0 */
    - read the two parameters -secstruct and -pi in init_meme()
            DATA_OPTN(1, secstruct, <filename>, \tuse secondary structure information from <filename> for the a priori probability, secondaryStructureFilename = _OPTION_);
            DATA_OPTN(1, pi, <double>, \t\tpseudocount added to each secondary structure prior (default 0.1), secondaryStructurePseudocount = atof(_OPTION_));
    - check if a motif length is given, check for DNA alphabet and check if -revcomp and -secstruct is set (if so disallow -revcomp)
          if (secondaryStructureFilename != NULL) {
            if (invcomp) {
              invcomp = FALSE;         
               fprintf(stderr, "ERROR: you set 'revcomp' and use secondary structure information: revcomp setting is not used\n");
            }
             if (w == 0) {
               fprintf(stderr, "ERROR: you must set the motif length (-w) to the value used for the computation of the secondary structure values\n");
                exit(1);
             }
             if (strcmp(alph, "DNA") != 0) {
                alph = "DNA";    
               fprintf(stderr, "ERROR: you set 'protein' and use secondary structure information: switch to DNA alphabet\n");
             }    
          }
    - set model->w if only one w is allowed
          if (model->max_w == model->min_w)
            model->w = model->max_w;
    - store the filename in dataset->secondaryStructureFilename and the pseudocount dataset->secondaryStructurePseudocount
          dataset->secondaryStructureFilename = secondaryStructureFilename;
          dataset->secondaryStructurePseudocount = secondaryStructurePseudocount;



oops.c
    - declaration of functions for TCM M-step
          double NewtonRaphsonMethod(
             MODEL *model,            /* the model */
             DATASET *dataset        /* the dataset */
          );
          double computeFirstDerivative(
             MODEL *model,            /* the model */
             DATASET *dataset,        /* the dataset */
             double lambda            /* current lambda */
          );
         double computeSecondDerivative(
             MODEL *model,            /* the model */
             DATASET *dataset,        /* the dataset */
             double lambda         /* current lambda */
          );
    - use Newton Raphson Method to maximize lambda     in m_step()
           /* finding the lambda for the next EM iteration is different for TCM model if secondary structure information is given */
           if ((dataset->secondaryStructureFilename != NULL) && (model->mtype == Tcm)) {
                double best_lambda = NewtonRaphsonMethod(model,dataset);        /* use Newton Raphson Method to find the best lambda */
                model->nsites_obs = q;
                model->lambda_obs = MIN(1.0, best_lambda);
                model->nsites = q*(1-wnsites) + model->psites*wnsites;
                /* lambda = (q(1-wnsites) + psites*wnsites) / n*m
                --> since lambda_obs = q/(n*m) it follows
                lambda = lambda_obs - lambda_obs*wnsites + (psites*wnsites/(n*m))
                */
                model->lambda = MIN(1.0, (model->lambda_obs - model->lambda_obs*wnsites + model->psites*wnsites/wps(dataset, w)) );
            }else{
    - define the functions NewtonRaphsonMethod(), computeFirstDerivative(), computeSecondDerivative()
    - use log sigma_ij instead of log 1/m if secondary structure information is given in e_step() for oops and
      use gamma * sigma_ij instead of gamma/m in e_step() for zoops
            if (dataset->secondaryStructureFilename != NULL) {
                if (mtype==Oops) {
                   log_gamma = s->log_sigma[j];
                }else{
                   log_lambda = LOG(gamma * s->sigma[j]);
                }
            }
    
    

tcm.c        
    - compute the position specific lambda values from the secondary structure information and check if the pseudocount has to be adjusted (tcm_e_step())
           /* use log sigma_ij * lambda * m instead of lambda if secondary structure information is given */
           /* NOTE: log_sigma is the prior for + or - strand --> here only + strand --> log_sigma = 0 */
           if (dataset->secondaryStructureFilename != NULL) {
    
              /* first check if the maximum sigma * (lambda*m) > 1   --> if so P(Zij=1 | \phi) can be > 1 */
             double Pcount = dataset->secondaryStructurePseudocount;
             double maxPrior = ((s->max_ss_value + Pcount) / (s->sum_ss_value + (m * Pcount))) * (lambda*m);

              if (maxPrior > 1.0 ) {
                /* compute new pseudocount that gives sigma_i_max = 1 = (max_ss_value + pseudocount) / (\sum (ss_value[i] + pseudocount)) * lambda * m */
                Pcount = (-1 * s->max_ss_value * lambda * m + s->sum_ss_value) / (m * (lambda - 1));
    
                /* for statistics keep maximum adjustment */
                if (Pcount - dataset->secondaryStructurePseudocount > MAXADJUST) {
                    MAXADJUST = Pcount - dataset->secondaryStructurePseudocount;
                }
             }
         
             /* compute new sigmas with this pseudocount */
             double sum = s->sum_ss_value + (m * Pcount);           /* \sum ss_value[i] + m*pseudocount */
             for (j=0; j < m; j++) {
                  s->sigma[j] = (s->ss_value[j] + Pcount) / sum;
             }
          }
    - use the position specific lambda values
           /* use the prior instead of lambda */
            if (dataset->secondaryStructureFilename != NULL) {
              double p = MIN(1.0, ( s->sigma[j] * lambda * m ) ) ;        /* rounding */
              log_lambda = LOG(p);    
                log_1mlambda = LOG(1-p);    
           }



subseq7.c
    - in case of two equally good starting points choose the one with higher single-strandedness for the substring in function score_llr_pop()
         /* in case of two equally good starting points choose the one with higher single-strandedness for the substring (if we have secondary structure information) */
        }else if (-log_pop == s_points[i_nsites0].score) {
          if (dataset->secondaryStructureFilename != NULL) {
                 double sigma_old = samples[ s_points[i_nsites0].iseq ]->sigma[ s_points[i_nsites0].ioff ];
                 double sigma_new = samples[ iseq ]->sigma[ ioff ];
                 if (sigma_new > sigma_old) {
                      s_points[i_nsites0].iseq = iseq;
                  s_points[i_nsites0].ioff = ioff;
                  s_points[i_nsites0].e_cons0 = eseq;
                  s_points[i_nsites0].wgt_nsites = wN;
                  s_points[i_nsites0].score = -log_pop;
                }
            }
    - to use secondary structures only during the start point search, add a bool parameter to global_max() and local_max()
          , BOOLEAN useSecStruct
      and call global_max and local_max with this flag
           /* during the start point search sort==TRUE; during the discretize model step sort==FALSE
             since we want to include secondary structure information only during the start point search, use sort as a flag and pass it to global_max */
           n_maxima = global_max(dataset, w, maxima, ic, sort);
         } else {
           /* during the start point search sort==TRUE; during the discretize model step sort==FALSE
            since we want to include secondary structure information only during the start point search, use sort as a flag and pass it to local_max */
           n_maxima = local_max(dataset, w, maxima, ic, sort);
          

    - use pY[j]*sigma[j] instead of pY[j] alone to determine the maximum in global_max() for ZOOPS and OOPS in the start point search (pY is the value for the current position of Theta0)
             max += INT_LOG(0);             /* to account for a sigma[j] = 0 */
             int log_sigma = 0;
             if ((dataset->secondaryStructureFilename != NULL) && (useSecStruct)) {
               log_sigma = s->intlog_sigma[j];
            }
             if (pY[j] + log_not_o[j] + log_sigma > max) {        /* new maximum found */
              /*if (pY[j] + log_not_o[j] > max) {        /* new maximum found */
                    max = pY[j] + log_not_o[j] + log_sigma;         /* pY * Pr(no overlap) */
                    /*max = pY[j] + log_not_o[j];         /* pY * Pr(no overlap) */
    - use pY[j]*sigma[j] instead of pY[j] alone to determine the maximum in local_max() for TCM start point search
            int log_sigma = 0;
            if ((dataset->secondaryStructureFilename != NULL) && (useSecStruct)) {
                   log_sigma = s->intlog_sigma[0];
                   max += log_sigma;
             }
            if ((dataset->secondaryStructureFilename != NULL) && (useSecStruct)) {
                 log_sigma = s->intlog_sigma[j];
             prob += log_sigma;
            }


display.c
    - output EF/PU values (if given) for the motif hits and the average EF/PU in align_sites()
          double aveSS = 0;
          int numHits = 0;
          /* added by M.H. */
          /* remove \n and add a > 0 ? statement */
          fprintf(outfile, "%6s %9s %10s %*sSite%*s", "Start", "P-value", "", ( (w/2-2) >0 ? (w/2-2) : 0), "", ((w - w/2 - 4) > 0 ? (w - w/2 - 4) : 0), "");
          /*  fprintf(outfile, "%6s %9s %10s %*sSite%*s\n", "Start", "P-value", "", w/2 - 2, "", w - w/2 - 4, "");*/
          /* added by M.H. */
          /* output 'SecStruc' */
          if (dataset->secondaryStructureFilename != NULL) {
                 fprintf(outfile, "%*s %10s %8s\n", MAX(0, w - ( (w-w/2-4) > 0 ? (w-w/2-4) : 0 ) - ( (w/2-2) >0 ? (w/2-2) : 0) - 4 ) ,  "","","SecStruc");
          }else{
                 fprintf(outfile, "\n");
          }
          if (dataset->secondaryStructureFilename != NULL) {
              fprintf(outfile, " %10s %8s", "", "--------");
          }
          /* remove \n  */
         /* print the alignment */
         if (pre[0] == '\0') {            /* print a dot in empty pre */
              fprintf(outfile, " %10s %-*s %-10s", ".", w, site, post);
         } else {
              fprintf(outfile, " %10s %-*s %-10s", pre, w, site, post);
         }
          /* print EF/PU value and compute average */
         if (dataset->secondaryStructureFilename != NULL) {
                  fprintf(outfile, " %f\n",s->ss_value[y]);
                 aveSS += s->ss_value[y];
                numHits ++;
           }else{
                 fprintf(outfile, "\n");
           }
          /* print average */
          if (dataset->secondaryStructureFilename != NULL) {
                  aveSS /= numHits;
                 fprintf(outfile, "average single-strandedness: %f\n",aveSS);
          }