00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "sampler.h"
00023
00024 #include <math.h>
00025
00026 #include <gsl/gsl_randist.h>
00027
00028
00029 WebSampler :: WebSampler(WebLinkGraph* g) throw (exception) {
00030 assert(g);
00031
00032 graph = g;
00033 number_of_nodes = graph->size();
00034 allocation_list = new int32[number_of_nodes];
00035 memset(allocation_list, 0, number_of_nodes * sizeof(int32));
00036
00037 ClearCounts();
00038 ClearScratch();
00039
00040 r = gsl_rng_alloc(gsl_rng_taus);
00041 if( !r ) {
00042 cerr << "error: couldn't initialize the random number generator" << endl;
00043 throw exception();
00044 }
00045
00046 eps = 1.0;
00047 xleaf = NULL;
00048 }
00049
00050
00051 void WebSampler :: ClearCounts() {
00052 for(WebNodeList::iterator i = graph->begin();
00053 i != graph->end(); i++ ) {
00054 (*i)->ClearOccupationCount();
00055 }
00056 last_run_size = 0;
00057 last_tagged_run_size = 0;
00058 }
00059
00060
00061 void WebSampler :: ClearScratch() {
00062 for(WebNodeList::iterator i = graph->begin();
00063 i != graph->end(); i++ ) {
00064 (*i)->SetScratch(0);
00065 }
00066 }
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076 void WebSampler :: PrintCounts(ostream& o) {
00077 char buf[1024];
00078 o << "# Sampler used: " << Name(buf) << endl;
00079 o << "# Number of samples produced: " << last_run_size << endl;
00080 o << "# node | occupation_count | 95% conf. interval" << endl;
00081 for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
00082 double k = (double)(*i)->OccupationCount();
00083 double eps = 1.96 * sqrt(k * (last_run_size - k)/last_run_size);
00084 o << (*i)->ID() << " "
00085 << (*i)->OccupationCount() << " "
00086 << max(static_cast<int>(floor(k - eps)),0) << " "
00087 << static_cast<unsigned int>(ceil(k + eps)) << endl;
00088 }
00089 }
00090
00091
00092
00093 void WebSampler :: PrintTagCounts(ostream& o) {
00094 char buf[1024];
00095 o << "# Sampler used: " << Name(buf) << endl;
00096 o << "# Number of (tagged) samples produced: " << last_tagged_run_size << endl;
00097 o << "# node | occupation_count | 95% conf. interval" << endl;
00098 for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
00099 if( (*i)->Tagged(0) ) {
00100 double k = (double)(*i)->OccupationCount();
00101 double eps = 1.96 * sqrt(k * (last_tagged_run_size - k)/last_tagged_run_size);
00102 o << (*i)->ID() << " "
00103 << (*i)->OccupationCount() << " "
00104 << max(static_cast<int>(floor(k - eps)),0) << " "
00105 << static_cast<unsigned int>(ceil(k + eps)) << endl;
00106 }
00107 }
00108 }
00109
00110
00111 void WebSampler :: ClearAllocForward() {
00112 assert(number_of_nodes > 0);
00113 memset(allocation_list, 0, number_of_nodes * sizeof(int32));
00114 }
00115
00116
00117 void WebSampler :: IncrementAllocForward(uint32 k, int32 num) {
00118 assert( static_cast<unsigned int>(k) < number_of_nodes);
00119 allocation_list[k] += num;
00120 }
00121
00122 void WebSampler :: SimulateAllocForward() {
00123
00124
00125 uint32 j = 0;
00126 for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++, j++ ) {
00127 for(int k = allocation_list[j]; k > 0; k--) {
00128
00129 WebNodePtr x = (*i);
00130 assert(x);
00131 while( (gsl_rng_uniform(r) > eps) &&
00132 x && (x->NumberOfValidToLinks() > 0)) {
00133 x = QEvolveFrom(x);
00134 }
00135
00136 if( x ) {
00137 x->IncrementOccupationCount();
00138 } else if( xleaf ) {
00139 xleaf->IncrementOccupationCount();
00140 }
00141
00142 }
00143 }
00144 }
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154 void WebSampler :: SimulateForward(long n) {
00155 assert(graph);
00156 assert(allocation_list);
00157 assert( n > 0 );
00158
00159 assert(number_of_nodes > 0);
00160
00161 ClearAllocForward();
00162
00163
00164 for(long i = 0; i < n; i++)
00165 allocation_list[(int32)gsl_rng_uniform_int(r, number_of_nodes)]++;
00166
00167 SimulateAllocForward();
00168
00169 last_run_size += n;
00170 }
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183 void WebSampler :: TaggedSimulateForward(long n, const uint32 *fromsetsize) {
00184 assert(graph);
00185 assert(allocation_list);
00186 assert( n > 0 );
00187
00188 assert(number_of_nodes > 0);
00189
00190
00191
00192 long num_less_than_K = 0;
00193 long num_greater_than_K = 0;
00194 double threshold = pow(1 - eps, TAG_NUMBER_OF_BITS);
00195 for(long i = 0; i < n; i++) {
00196 if( gsl_rng_uniform(r) > threshold ) {
00197 num_less_than_K++;
00198 } else {
00199 num_greater_than_K++;
00200 }
00201 }
00202
00203
00204 {
00205 cerr << "info: tagged simulation uses " << num_greater_than_K << " global candidates ("
00206 << (100 * num_greater_than_K)/n << "%)"<< endl;
00207 long num_accepted = 0;
00208
00209 memset(allocation_list, 0, number_of_nodes * sizeof(int32));
00210 for(long i = 0; i < num_greater_than_K; i++)
00211 allocation_list[(int32)gsl_rng_uniform_int(r, number_of_nodes)]++;
00212
00213
00214
00215 uint32 j = 0;
00216 for(WebNodeList::iterator i = graph->begin();
00217 i != graph->end(); i++, j++ ) {
00218 for(int k = allocation_list[j]; k > 0; k--) {
00219
00220 WebNodePtr x = (*i);
00221 assert(x);
00222
00223 for(int t = 0; (t < TAG_NUMBER_OF_BITS) && (x->NumberOfValidToLinks() > 0); t++) {
00224 x = QEvolveFrom(x);
00225 }
00226
00227 while( (gsl_rng_uniform(r) > eps) && (x->NumberOfValidToLinks() > 0)) {
00228 x = QEvolveFrom(x);
00229 }
00230
00231 if( x->Tagged(0) ) {
00232 x->IncrementOccupationCount();
00233 last_tagged_run_size++;
00234 num_accepted++;
00235 }
00236 }
00237 }
00238 cerr << "info: tagged simulation has produced " << num_accepted << " global samples" << endl;
00239 }
00240
00241
00242 {
00243 cerr << "info: tagged simulation uses " << num_less_than_K << " local candidates" << endl;
00244 long num_accepted = 0;
00245
00246
00247 for(int k = 0; k < TAG_NUMBER_OF_BITS; k++) {
00248 probabilities[k] = static_cast<double>(fromsetsize[k]) * eps * pow(1 - eps, k);
00249 allocated[k] = 0;
00250 fromsetsize_remaining[k] = fromsetsize[k];
00251 }
00252 gsl_ran_discrete_t *grd = gsl_ran_discrete_preproc(TAG_NUMBER_OF_BITS, probabilities);
00253 for(long l = 0; l < num_less_than_K; l++ ) {
00254
00255
00256 allocated[gsl_ran_discrete(r,grd)]++;
00257 }
00258 gsl_ran_discrete_free(grd);
00259
00260
00261 for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++ ) {
00262 assert(*i);
00263 for(int k = 0; k < TAG_NUMBER_OF_BITS; k++) {
00264 if( (*i)->Tagged(k) ) {
00265
00266 fromsetsize_remaining[k]--;
00267 assert(fromsetsize_remaining[k] >= 0);
00268
00269
00270 double p = 1.0/static_cast<double>(fromsetsize_remaining[k]);
00271 unsigned int num_starts = gsl_ran_binomial(r, p, allocated[k]);
00272
00273
00274
00275
00276 for(uint32 c = 0; c < num_starts; c++) {
00277 WebNodePtr x = (*i);
00278 int s = k;
00279 while( (s > 0) && (x->Tagged(s)) ) {
00280
00281
00282 x = QEvolveFrom(x);
00283 s--;
00284 }
00285
00286 if( (s == 0) && (x->Tagged(0)) ) {
00287 x->IncrementOccupationCount();
00288 last_tagged_run_size++;
00289 num_accepted++;
00290 }
00291 }
00292
00293
00294
00295 allocated[k] -= num_starts;
00296 assert(allocated[k] >= 0);
00297 }
00298 }
00299 }
00300 cerr << "info: tagged simulation has produced " << num_accepted << " local samples" << endl;
00301 }
00302 }
00303
00304
00305
00306 PageRankSampler :: PageRankSampler(WebLinkGraph* agraph): WebSampler(agraph) {
00307 eps = 0.5;
00308 }
00309
00310 void PageRankSampler :: SetParameters(double epsilon) {
00311 assert(graph);
00312 assert( (epsilon > 0) && (epsilon < 1));
00313 eps = epsilon;
00314 ClearCounts();
00315 }
00316
00317
00318
00319 inline WebNodePtr PageRankSampler :: QEvolveFrom(WebNodePtr x) {
00320
00321 int nx = x->NumberOfValidToLinks() + x->NumberOfLeafLinks();
00322 if( nx == 0 ) {
00323 return x;
00324 }
00325
00326 register int where =
00327 gsl_rng_uniform_int(r, nx);
00328 if( where < x->NumberOfValidToLinks() ) {
00329 return x->ValidToLink(where);
00330 } else {
00331 xleaf = x->ValidLeafLinkDirectly(where);
00332 return NULL;
00333 }
00334
00335 }
00336
00337 char *PageRankSampler :: Name(char *buf) {
00338 sprintf(buf, "PageRank, epsilon = %f", eps);
00339 return buf;
00340 }
00341
00342
00343
00344 DateBiasedPageRankSampler :: DateBiasedPageRankSampler(WebLinkGraph* agraph): WebSampler(agraph) {
00345 eps = 0.5;
00346 lam = 0.01;
00347 daterange = kuint16max;
00348 lamhat = lam/daterange;
00349
00350
00351 assert(sizeof(DBScratchStruct) == sizeof(ScratchStruct));
00352 }
00353
00354 void DateBiasedPageRankSampler :: SetParameters(double epsilon, double lambda, uint16 dr) {
00355 assert(graph);
00356 assert( (epsilon > 0) && (epsilon < 1));
00357 eps = epsilon;
00358 lam = lambda;
00359 assert(dr > 0);
00360 daterange = dr;
00361 lamhat = lam/daterange;
00362 ClearCounts();
00363 }
00364
00365
00366
00367
00368 inline WebNodePtr DateBiasedPageRankSampler :: QEvolveFrom(WebNodePtr x) {
00369
00370 int nx = x->NumberOfValidToLinks() + x->NumberOfLeafLinks();
00371 if( nx == 0 ) {
00372 return x;
00373 }
00374
00375 register DBScratchStruct scratch(x->Scratch());
00376
00377 if( !scratch.ss ) {
00378 scratch.mass = 0.0;
00379 for(register int t = 0; t < x->NumberOfValidToLinks(); t++) {
00380 scratch.mass += exp(-lamhat * max(x->Date() - x->ValidToLink(t)->Date(), 0));
00381 }
00382 for(register int t = 0; t < x->NumberOfLeafLinks(); t++) {
00383 scratch.mass += exp(-lamhat * max(x->Date() - x->ValidLeafLink(t)->Date(), 0));
00384 }
00385 x->SetScratch(scratch.ss);
00386 }
00387
00388 register float tmass = 0.0;
00389
00390 for(register int t = 0; t < x->NumberOfValidToLinks(); t++) {
00391 register float xi = gsl_rng_uniform(r);
00392 register float p = exp(-lamhat * max(x->Date() - x->ValidToLink(t)->Date(),0) );
00393 if( p + tmass * xi >= scratch.mass * xi ) {
00394 return x->ValidToLink(t);
00395 } else {
00396 tmass += p;
00397 }
00398 }
00399
00400 for(register int t = 0; t < x->NumberOfLeafLinks(); t++) {
00401 register float xi = gsl_rng_uniform(r);
00402 register float p = exp(-lamhat * max(x->Date() - x->ValidLeafLink(t)->Date(),0) );
00403 if( p + tmass * xi >= scratch.mass * xi ) {
00404 xleaf = x->ValidLeafLink(t);
00405 return NULL;
00406 } else {
00407 tmass += p;
00408 }
00409 }
00410
00411
00412
00413 cerr << "warning: precision difficulties in DateBiasedPageRankSampler::QEvolveFrom()" << endl;
00414
00415 register int where =
00416 gsl_rng_uniform_int(r, nx);
00417 if( where < x->NumberOfValidToLinks() ) {
00418 return x->ValidToLink(where);
00419 } else {
00420 xleaf = x->ValidLeafLinkDirectly(where);
00421 return NULL;
00422 }
00423
00424 }
00425
00426
00427 char *DateBiasedPageRankSampler :: Name(char *buf) {
00428 sprintf(buf, "DateBiasedPageRank, epsilon = %f, lambda = %f", eps, lam);
00429 return buf;
00430 }
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484 TruncatedKleinbergSampler :: TruncatedKleinbergSampler(WebLinkGraph* agraph): WebSampler(agraph) {
00485 eps = 0.5;
00486 }
00487
00488 void TruncatedKleinbergSampler :: SetParameters(double epsilon, ktype what) {
00489 assert(graph);
00490 assert( (epsilon > 0) && (epsilon < 1));
00491 eps = epsilon;
00492 which = what;
00493 ClearCounts();
00494 }
00495
00496
00497
00498 inline WebNodePtr TruncatedKleinbergSampler :: QEvolveFrom(WebNodePtr x) {
00499
00500 int nx;
00501
00502 switch(which) {
00503 case hubs:
00504 nx = x->NumberOfValidToLinks();
00505 if( nx > 0 ) {
00506
00507 register int where = gsl_rng_uniform_int(r, nx);
00508 x = x->ValidToLink(where);
00509
00510 nx = x->NumberOfValidFromLinks();
00511 if( nx > 0 ) {
00512 where = gsl_rng_uniform_int(r, nx);
00513 return x->ValidFromLink(where);
00514 } else {
00515 return x;
00516 }
00517 } else {
00518 return x;
00519 }
00520 break;
00521 case auth:
00522 nx = x->NumberOfValidFromLinks();
00523 if( nx > 0 ) {
00524
00525 register int where = gsl_rng_uniform_int(r, nx);
00526 x = x->ValidFromLink(where);
00527
00528 nx = x->NumberOfValidToLinks();
00529 if( nx > 0 ) {
00530 where = gsl_rng_uniform_int(r, nx);
00531 return x->ValidToLink(where);
00532 } else {
00533 return x;
00534 }
00535 } else {
00536 return x;
00537 }
00538 break;
00539 default:
00540 return x;
00541 }
00542
00543 }
00544
00545 char *TruncatedKleinbergSampler :: Name(char *buf) {
00546 sprintf(buf, "Truncated Kleinberg, epsilon = %f %s", eps,
00547 (which == hubs) ? "(hubs)" : "(auth)");
00548 return buf;
00549 }