\datethis @s octa int @*Intro. This program is the twelfth in a series of exploratory studies by which I'm attempting to gain first-hand experience with OBDD structures, as I prepare Section 7.1.4 of {\sl The Art of Computer Programming}. Again it's quite different from its predecessors: This one implements a new approach to finding an optimum ordering for the variables of a given Boolean function or set of functions. The new approach is based on QDDs (quasi-reduced BDDs), which undergo the ``jump-up'' operation but not the normal synthesis operations of a traditional BDD package. It requires no hashing or garbage collection. The given function is specified explicitly by generating its QDD. As a demonstration, we implement the $2^m$-way multiplexer $M_m(x_1,\ldots,x_m;x_{m+1},\ldots,x_{m+2^m})$ here, because it exhibits great extremes of BDD size under different orderings. But with \.{CWEB} change files any other function can be substituted. If desired, optimization will be restricted to a subrange of the possible levels, keeping variables at the top and bottom of the BDD in place. We assume that |botvar-topvar| is at most 24. @d mm 2 /* order of MUX in this demonstration version */ @d nn (mm+(1<=1| */ @d botvar nn /* last variable whose order will be varied (must be |<=nn| */ @d nnn (botvar+1-topvar) /* variables being permuted (at most 25) */ @# @d o mems++ /* count a memory access to an octabyte */ @d oo mems+=2 /* or two */ @d ooo mems+=3 /* or three */ @d oooo mems+=4 /* or four, wow */ @c #include #include @@; @@; unsigned long long mems; @@; main () { register int h,i,j,k,l,lo,hi,jj,kk,var,cycle; octa x; @; for (cycle=1;cycle<1<<(nnn-1);cycle++) { if (cycle%interval==0) { printf("Beginning cycle %d (%llu mems so far)\n",cycle,mems); fflush(stdout); } @; } @

; printf("Altogether %llu mems.\n",mems); } @ @= typedef unsigned long long octa; /* an octabyte */ @* QDD representation. The quasi-reduced binary decision decision for a Boolean function of $n$ variables has $q_k$ nodes on level~$k$, for $0\le k\le n$, one for every distinct subfunction that can arise by hard-wiring constant values for the initial variables $(x_1,\ldots,x_k)$. The sequence $(q_0,\ldots,q_n)$ is called the function's {\it quasi-profile}. The maximum value of $q_k$ is $\min(t2^k,2^{2^{n-k}})$ when there are $t$ output functions of $n$ input variables. In particular, the maximum quasi-profile when $n=25$ and $t=1$ is $$(1,2,4,\ldots,2^{19},2^{20},2^{16},2^8,2^4,2^2,2);$$ its potentially biggest element, $2^{20}$, occurs when $k=20$. In this implementation the nodes on level $k$ are numbered from 0 to $q_k-1$, and we store them consecutively in a big array |node[k]|. Each node contains three fields, (|lo|,|hi|,|dep|), packed into a 64-bit word. The |lo| and |hi| fields, 20 bits each, point to nodes at level~$k+1$; the |dep| field, which occupies the other 24 bits, represents the set of variables on which this subfunction depends. Level $n$ is special; it has the two ``sink'' nodes 0 and~1, which are represented by |(0,0,0)| and |(1,1,0)|, respectively. For example, suppose $n=3$ and $f(x_1,x_2,x_3)=(\bar x_1\land x_2) \lor(x_1\land x_3)$. The nodes at level~2, which correspond to branching on~$x_3$, are |node[2][0]=(0,0,0)|, |node[2][1]=(1,1,0)|, and |node[2][2]=(0,1,0x4)|, representing the respective subfunctions 0, 1,~$x_3$. The nodes at level~1, which correspond to branching on~$x_2$, are |node[1][0]=(0,1,0x2)| and |node[1][1]=(2,2,0x4)|, representing the subfunctions $x_2$ and $x_3$. And there's one node at level~0, namely |node[0][0]=(0,1,0x7)|. @= octa *node[nnn+1]; /* the node arrays */ int qq[nnn+1]; /* the quasi-profile: $|qq|[k]=q_{k-topvar+1}$ */ @ To launch this structure, we first need to allocate the |node| arrays. They are actually created only for levels |topvar-1| through |botvar|. @= for (k=0;k<=nnn;k++) { j=worksize,kk=k+topvar-1; if (nn-kk<5 && j>1<<(1<<(nn-kk))) j=1<<(1<<(nn-kk)); if (kk<20 && ((worksize/outs)>>kk)>0 && (j>outs*(1<>1|, and I treat level~0 as a special case. This saves time even when $n<25$. (If I wanted to handle more than 25 variables, I could use a more elaborate scheme in which the packing conventions vary from level to level. But 25 is plenty for me at the moment.) @d pack(lo,hi,dp) (((((octa)(dp)<<20)+(lo))<<20)+(hi)) @d lofield(x) (((x)>>20)&0xfffff) @d hifield(x) ((x)&0xfffff) @d depfield(x) (((x)>>40)<<1) @d extrabit(k,j) (k==0? lofield(node[0][j])!=hifield(node[0][j]):0) @ Once the |node| arrays exist, we can set up the initial QDD. Here I implement the familiar function $$M_m(x_1,\ldots,x_m;x_{m+1},\ldots,x_{m+2^m})\;=\; x_{m+1+(x_1\ldots x_m)_2}.$$ (In fact, the example $f(x_1,x_2,x_3)$ above is $M_1(x_1;x_2,x_3)$.) Only the |lo| and |hi| fields are initialized here, because we will use the reduction routine to compute appropriate |dep|s. The following code assumes that |topvar=1| and |botvar=nn|. @= for (k=0;k; @ For diagnostic purposes, I might want to pretty-print the current QDD. @= void print_level(int k) { register int j; printf("level %d (x%d):\n",k,map[k]+topvar); for (j=0;j= typedef struct {@+unsigned int l,r;@+} pair; @ @= pair work[2*worksize],head[worksize]; /* the big workspace */ @ We use |j+1| to link to entry |j|, so that null links are distinguishable from links to entry~0. @= void print_work(int k) { register int lo,hi; printf("Current workspace for level %d:\n",k); for (lo=0;lo= void reduce(int k) { register int lo,hi,dep,p,nextp,q; q=0; /* the number of nodes created so far */ for (o,lo=0;lo; } for (p=head[lo].r;p;p=nextp) { o,nextp=work[p-1].r,hi=work[p-1].l; o,head[hi].l=0; /* clean up */ } o,head[lo].r=0; /* reset the |lo| list to empty */ } o,qq[k]=q; } @ @= { if (q>=worksize) { fprintf(stderr,"Sorry, level %d of the QDD is too big (worksize=%d)!\n", k,worksize); exit(-3); } if (lo!=hi) oo,dep=depfield(node[k+1][lo] | node[k+1][hi])|(1<>1); o,clone[p-1]=q; o,head[hi].l=++q; } @ @= int clone[2*worksize]; /* the new node addresses */ @ Let's illustrate |reduce| by using it to tidy up the initial QDD. The main point of interest is that we must remap the pointers on level~$k-1$ after level~$k$ has been reduced. @d makework(j,lo,hi) work[j].l=hi,work[j].r=head[lo].r,head[lo].r=j+1 @= for (k=nnn-1;;k--) { for (o,j=0;jj$, takes the variable at level~$k$ of the branching structure and moves it up to level~$j$, sliding the previous occupants of levels $(j,j+1,\ldots,k-1)$ down one notch. For example, suppose $n=7$ and we do jump-up $5\to2$. Before the operation, levels 0 thru~6 represent branching on variables $(x_1,x_2,x_3,x_4,x_5,x_6,x_7)$, respectively; after jumping, they represent branching on $(x_1,x_2,x_6,x_3,x_4,x_5,x_7)$. The |dep| fields are always based on the assumption that level $k$ branches on variable $x_{k+1}$. An auxiliary |map| table records the results of previous jumps; level~$k$ actually branches on variable $x_{map[k]+topvar}$ of the original unpermuted function. We also maintain a |bitmap| array, where |bitmap[k]| contains the bits that encode the set $\{|map|[0],\ldots,|map|[k-1]\}$. @= int map[nnn]; /* the current permutation */ int bitmap[nnn]; /* its initial dependency sets */ @ @= for (j=k=0;j= o,var=map[kk]; for (k=kk;k>jj;k--) { @; reduce(k); oo,map[k]=map[k-1]; oo,bitmap[k]=bitmap[k-1]|(1<; @ Simple but cool list processing does the trick here. @= for (o,j=0;j= for (o,j=0;j= int b[1<1$, take the sequence for $n-1$ and place $n$ at the bottom; then jump $n$ up to the top; then use the sequence for $n-1$ on the {\it lower\/} $n-1$ elements. This idea turns out to be equivalent to the following: The $k$th jump-up is $\nu k+\rho k\to\nu k-1$, for $1\le k<2^{n-1}$. (In this formula $\nu k$ denotes the number of 1s in the binary representation of~$k$, and $\rho k$ denotes the number of 0s at the right end of that representation.) For example, when $n=4$ the jumps are $1\to0$, $2\to0$, $2\to1$, $3\to0$, $2\to1$, $3\to1$, $3\to2$; the permutations are $$\thinmuskip=\thickmuskip 1234,2134,3214,3124,4312,4132,4213,4231.$$ The idea is that all $k$-combinations that don't contain~$n$ appear in the first half; those that do contain~$n$ appear in the second half. @= for (jj=0,kk=1,k=cycle;(k&1)==0;kk++) k>>=1; /* compute $\rho k$ */ for (k&=k-1;k;jj++,kk++) k&=k-1; /* compute $\nu k$ */ @; for (k=jj+1;k<=kk;k++) @; @ We also need to gather statistics from the initial QDD. @= @; for (k=1;k; @ Every node in |node[k]| is marked with a |dep| field, telling which of the remaining variables it depends on. More precisely, if the |dep| contains the bit |1<= { switch (((nnn-2)>>3)*4+((k-1)>>3)) { case 2*4+0:@+for (j=0;j<256;j++) ooo,count1[j]=count2[j]=count3[j]=0; for (o,j=0;j>56]++; oo,count2[(x>>48)&0xff]++; oo,count3[(x>>40)&0xff]++; };@+break; case 2*4+1:@+for (j=0;j<256;j++) oo,count1[j]=count2[j]=0; for (o,j=0;j>56]++; oo,count2[(x>>48)&0xff]++; };@+break; case 2*4+2:@+for (j=0;j<256;j++) o,count1[j]=0; for (o,j=0;j>56]++; };@+break; case 1*4+0:@+for (j=0;j<256;j++) oo,count2[j]=count3[j]=0; for (o,j=0;j>48)&0xff]++; oo,count3[(x>>40)&0xff]++; };@+break; case 1*4+1:@+for (j=0;j<256;j++) o,count2[j]=0; for (o,j=0;j>48)&0xff]++; };@+break; case 0*4+0:@+for (j=0;j<256;j++) o,count3[j]=0; for (o,j=0;j>40)&0xff]++; };@+break; } for (j=k;j; } @ @= int count1[256],count2[256],count3[256]; /* bit pattern counts */ @ @= { l=1<<((j-1)&0x7),i=0; if (j<9) { for (i=0,h=l;h<256;h=(h+1)|l) o,i+=count3[h]; }@+else if (j<17) { for (i=0,h=l;h<256;h=(h+1)|l) o,i+=count2[h]; }@+else for (i=0,h=l;h<256;h=(h+1)|l) o,i+=count1[h]; oooo,b[bitmap[k]][map[j]]=i; } @ At this point we have the initial QDD, with |map[j]=j| for all |j|. @= for (j=0;j>k)&1; } @ At last comes the final reckoning: We can compute the minimum cost that can be achieved when subset |X| occurs at the top of the BDD, for $1\le X<2^n$. While we're at it, we might as well compute the maximum cost too. @

= for (k=1;k<1<h) h=l,lo=j; } cost[k]=h,routing[k]=lo; } printf("Pessimum ordering (cost %d+externals) can be achieved thus:\n", cost[(1<= int cost[1<