/* Program: GRAFWALK Author: Arif Zaman Last changed: 18 Oct, 1994 Usage: GRAFWALK [options] datafile or GRAFWALK -h With -h, a detailed description of the options is given. The program reads in the matrix in "datafile" and writes a file with the extenstion ".out". The input file is the starting matrix. The output file consists of the expected values of various statistics and the estimated error of these expected values. The expectations are computed under the assumption that all matrices with the same row and column sums as the original matrix are equally likely. Statistics: Any statistic's expectation and standard deviation can be estimated using this program. Currently it is set to measure the p-values of similarity and dis-similarity in the rows of the matrix. The similarity of two rows i and j can be measured by the Adjusted Cross-Product Ratio (CPR), where (1+n_00[i,j]) x (1+n_11[i,j]) CPR(i,j) = ----------------------------- (1+n_01[i,j]) x (1+n_11[i,j]) and n_ab is the number of columns where an a in row i has a corresponding b in row j. This number is large when the rows are similar and small when different Define the related quantities: CPR(i) = Prod CPR(i,j) for all j not equal to i. CPRa(i) = Prod a(CPR(i,j)) for all j not equal to i. CPR = Prod CPR(i) for all i. CPRp = Prod CPRp(i) for all i. where a(x)=x if x>=1, and a(x)=1/x for x<1, i.e. a(x) = exp(abs(log(x))). With these definitions, CPR(i) is a measure of the similarity of row i with all the other rows. CPR measures the overall similarity of rows in the entire matrix. The CPRp measures are similar, except that they are large when rows are either too similar or too different, so they measure interaction of any kind. By the symbol CPR0, we refer to the above statistics computed for the original data matrix. Comparing CPR0 against a CPR computed for a random matrix, we can test the following Hypotheses: 1) Hij: CPR(i,j) = CPR0(i,j) Ha: CPR(i,j) > CPR0(i,j) 2) Ha: CPR(i,j) < CPR0(i,j) 3) Hi: CPR(i) = CPR0(i) Ha: CPR(i) > CPR0(i) 4) Ha CPR(i) < CPR0(i) 5) CPRp(i) = CPRp0(i) Ha: CPRp(i) > CPRp0(i) 6) H: CPR = CPR0 Ha: CPR > CPR0 7) Ha: CPR < CPR0 8) CPRp = CPRp0 Ha: CPRp > CPRp0 Note that only the onesided tests make sense in (5) and (8). */ /* FUZZ is used in comparisons for p-values. Two floating point values are considered equal if they don't differ by more than FUZZ. */ #define FUZZ 1e-8 /* The precision used in all the floating calculations */ #define reals double #include #include #include #include #include #include int rows,cols; /* Adjustable arrays: char rsum[rows], csum[cols], m[rows][cols], n10[rows][rows]; the code looks uglier but the program is more flexible this way. */ char *matrix_m, *rsum, *csum, *matrix_n10; #define m(i,j) matrix_m[i*cols+j] #define n10(i,j) matrix_n10[i*rows+j] int r1, r2, c1, c2, nbrs, sum; /*--------------------------------------------------------- INITIALIZE Read in the matrix from a file. Compute row/column sums. Compute the n10 matrix. Compute the number of neighbors: nbrs. */ void initialize(FILE *in) { int r, c; fscanf(in,"%d %d\n",&rows,&cols); matrix_m = malloc(rows*cols+3); rsum = malloc(rows); csum = malloc(cols); matrix_n10=malloc(rows*rows); if (matrix_n10==NULL) { printf("Not Enough memory\n"); exit(1); } for (r=0; r>8) % m; } /*--------------------------------------------------------- FLIP This routine should ONLY be called with r1, r2, c1 and c2 set so that m[r1,c1]=m[r2,c2]=0 and m[r1,c2]=m[r2,c1]=1. Assuming this, it reverses the 4 cells at the intersections of r1, r2 and c1, c2. Note that this operation keeps the row and column total the same. */ void flip(void) { int r; m(r1,c1) = 1; m(r1,c2) = 0; m(r2,c1) = 0; m(r2,c2) = 1; for (r=0; r=0; r1++) for (r2=r1+1; (r2=0); r2++) n -= n10(r1,r2)*n10(r2,r1); r1--; r2--; n += n10(r1,r2)*n10(r2,r1); n1 = n / n10(r1,r2); n2 = n % n10(r1,r2); for (c=0; (n1>=0) || (n2>=0); c++) { if (m(r1,c)==0) { if (m(r2,c)==1) if ((n1--)==0) c1=c; } else { if (m(r2,c)==0) if ((n2--)==0) c2=c; } } flip(); } /* STATISTICS The random walk visits each configuration with a (stationary) probability proportional to the number of neighbors it has. So to compute the expectation of any random variable T, we compute it at each configuration of the random walk, but take a weighted average: E(T) = sum (ti*wi) / sum (wi) where wi are the weights (1/nbrs). The expectation is easy, but to find the variance, we need to recognize that the terms have autocorrelations and that the expression for E(T) is a ratio estimator, since both numerator and denominator are random. To deal with the autocorrelation, we aggregate a sequence of (nmu) terms and get Y1 = sum (ti*wi) and X1 = sum(wi) for i=1...nmu. We then skip (nskp) terms to get rid of autocorrelation and then construct X2, Y2, ... Xn, Yn the same way. For this ratio estimator, a biased estimator for the variance is given as: E(X) = r = sum Yi / sum Xi V(r) = (sum Xi) ^ (-2) sum (Yi - r Xi)^2 This can be slightly improved by not skipping any terms for the computation of E(X). This will improve the estimate, i.e. reduce the V(r), but the amount of improvement is difficult to estimate. So we use the improved estimate but the un-improved variance estimate to get a conservative confidence interval. */ reals xi, sumxi, sumxi2, improved_sumxi; /* The following matrices are used to estimate the variance of the */ /* estimates. They have one entry for each test, so the total size */ /* is rows*(rows+2)+3 */ long size; /* rows * (rows+2) + 3 */ reals *t, /* The CPR's of the original data */ *sim, /* Statistics for similarity of rows */ *onesided, /* Statistics for interaction of rows */ *yi, /* Y's as defined above */ *sumyi, /* Partial sums of Y's */ *sumyi2, /* Sums of Y^2 */ *sumxiyi; /* Sums of X*Y */ /* these are pointer within yi, contianing */ /* p-values of tests 3,4,5,6,7 and 8 */ reals *yi_rowsim, *yi_rowdif, *yi_mat; #define yi_rowint(r) yi[(r)*(rows+1)] long int num; /*--------------------------------------------------------- INISTAT For the very first time compute the cross product ratio for every combination of rows for the original data */ void inistat(void) { long bytesize; size = rows * (rows+2) + 3; bytesize = size * sizeof(*t); sim = malloc(rows*sizeof(*sim)); onesided = malloc(rows*sizeof(*onesided)); t = malloc(bytesize); yi = malloc(bytesize); sumyi = malloc(bytesize); sumyi2 = malloc(bytesize); sumxiyi = malloc(bytesize); if (sumxiyi==NULL) { printf("Not enough memory. Array size too big.\n"); exit(1); } memset(t, 0,bytesize); memset(yi, 0,bytesize); memset(sumyi, 0,bytesize); memset(sumyi2, 0,bytesize); memset(sumxiyi,0,bytesize); for (r1=0; r1=1.0-FUZZ) yi[r2r1] += wt; sim[r1] *= ratio; sim[r2] *= ratio; if (stat<1.0) { if (t[r1r2]<1.0) ratio = 1/ratio; else ratio = 1/(stat*t[r1r2]); } else { if (t[r1r2]<1.0) ratio = stat*t[r1r2]; /* else ratio = ratio; */ } onesided[r1] *= ratio; onesided[r2] *= ratio; } ones = stat = 1.0; for (r1=0; r1= 1.0-FUZZ) yi_rowsim[r1] += wt; if (onesided[r1] >= 1.0-FUZZ) yi_rowint(r1) += wt; stat *= sim[r1]; ones *= onesided[r1]; } if (stat <= 1.0+FUZZ) yi_mat[0] += wt; if (stat >= 1.0-FUZZ) yi_mat[1] += wt; if (ones >= 1.0-FUZZ) yi_mat[2] += wt; xi += wt; num++; } /* SDSTAT At each SD step, compute the weighted sum of squares, estimate the sd of each element (using the ratio estimator formulas), and return the largest entry. */ reals sdstat(void) { reals r,e,err; int r1r2; err = 0; /* largest error */ sumxi += xi; sumxi2 += xi*xi; for (r1r2=0; r1r20) { r = (sumyi[r1r2]/(sumxi+improved_sumxi)); e = (sumyi2[r1r2] - 2*r*sumxiyi[r1r2] + r*r*sumxi2) / (sumxi * sumxi); } if (e>err) err=e; } memset(yi,0,(size)*sizeof(*yi)); xi = 0; if (err<0) return 196.0*sqrt(-err); /* just to avoid roundoff errors */ else return 196.0*sqrt( err); /* plus mius of % conf interval */ } /*--------------------------------------------------------- IMPROVESTAT This routine adds the yi's generated during the nskip steps into sumyi and keeps a separate improved_sumxi. The sumyi is not used by the sd calculations, so the estimate of the variance in unaffected by this. */ void improvestat() { int r1r2; for (r1r2=0; r1r21) { if (**(++argv)=='-') { s=(*argv)+1; t=strchr(s,'=')+1; switch(*s) { case 't': case 'T': *errtol = atof(t); break; case 'l': case 'L': *nlimit = atol(t); break; case 'a': case 'A': *nskip = atoi(t); break; case 's': case 'S': *nsd = atoi(t); break; case 'm': case 'M': *nmu = atoi(t); break; case 'd': case 'D': if ((*t=='s') || (*t=='S')) *step = &randomsparse; else if ((*t=='d') || (*t=='D')) *step = &randomdense; else { printf("Incorrect -d switch"); exit(1); } break; case 'h': case 'H': printf( "Usage: GRAFWALK [switches] datafile\n" "\n" "Output is written to datafile.out\n" "The switches are optional (can be abbreviated by the first letter):\n" "\n" " These switches set the stopping criterea:\n" " -tol=# Stop once the largest 95%% Conf Int for p-values <= #%%\n" " -limit=# Stop after # SD samples = limit*(sd+auto) samples.\n" " (0 means don't stop)\n" " These affect the estimation of the standard deviations\n" " -auto=# After each SD sample, skip # samples for autocorrelation.\n" " -sd=# Aggregate # samples to form one sample for SD computation.\n" " These affect speed, not statistical properties. Experiment with them\n" " to find the best before any long run.\n" " -m=# One sample will be taken every # steps.\n" " -density=dense Faster for matrices with many zeros and ones.\n" " -density=sparse Faster for matrices with mostly zeros or ones.\n" "\n" "Default: -t=0.1 -l=0 -a=rows*cols/m -s=100 -m=rows*cols/5+5\n" " a matrix is considered sparse it is less than 0.25 full\n" ); exit(1); default: printf("improper switch %s",*argv); exit(1); } } else name=*argv; } in=fopen(name,"rt"); if (in==NULL) { printf("Couldn't open data file [%s] for input\n" "Enter data from the terminal\n",name); in=stdin; *out=stdout; } else { strcpy(oname,name); dot=strchr(oname,'.'); if (dot==NULL) strcat(oname,".out"); else strcpy(dot+1,"out"); *out=fopen(oname,"wt"); if (*out==NULL) *out=stdout; } initialize(in); if (*nmu<0) *nmu = ((long int) rows*cols)/5 + 5; if (*nskip<0) *nskip = ((long int) rows*cols*5)/(*nmu); if (*step==NULL) { density=(rows*cols)/(reals)sum; if (density>0.5) density = 1-density; *step = (density<0.25) ? randomsparse : randomdense; } fprintf(*out,"GRAFWALK\n" "\n" "Stopping criterea:\n" " Stop once all 95%% confidence intervals for p-values are\n" " smaller than +/-%.4f%%.\n" " Stop after %ld SD samples.\n" "\n" "Calculation of the standard deviations:\n" " Assume there is negligible autocorrelation between observations\n" " that are %d samples apart.\n" " Aggregate %d samples to form one sample for SD computation.\n" "\n" "Computational speed parameters:\n" " One sample will be taken every %d steps.\n" " Method: %s matrices.\n" "\n", *errtol, *nlimit, *nskip, *nsd, *nmu, (*step == &randomsparse) ? "sparse" : "dense"); if (*nlimit==0) *nlimit=LONG_MAX; fflush(*out); } int main(int argc,char *argv[]) { int nmu, nsd, nskip; long nlimit; reals errtol; int i,j; long k; char *name; FILE *out; reals err, dif, pct; time_t time0; void (*step)(); parseargs(argc, argv, &nmu, &nsd, &nskip, &nlimit, &errtol, &out, &step); inistat(); inirand(); mustat(); time(&time0); for (k=0; k 0) dif = difftime( time(NULL), time0); fprintf(stderr, "\r%ld samples, %ld sd steps, err=%f, %.0f secs,", num-1, k+1, err, dif); pct = errtol/err; pct *= pct; if (pct < (k+1)/(reals)nlimit) pct = (k+1)/(reals) nlimit; fprintf(stderr, " %3.1f%% done ",pct*100); if ((err10)) break; } fprintf(out, "Simulation Results:\n" " A total of %ld steps were taken.\n" " The weighted average included %ld samples.\n" " The estimate of the SD was formed using %ld aggregated averages.\n" " The largest 95%% confidence interval is +/- %f\n" " Elapsed time: %.0lf seconds.\n" "\n", (num-1)*nmu, num-1, k, err, difftime(time(NULL),time0) ); printestimates(out); return 0; }