Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members  

graphbuilder.cc

Go to the documentation of this file.
/** @file graphbuilder.cc */
00002 /* 
00003  * Copyright (C) 2002 Laird Breyer
00004  *  
00005  * This program is free software; you can redistribute it and/or modify
00006  * it under the terms of the GNU General Public License as published by
00007  * the Free Software Foundation; either version 2 of the License, or
00008  * (at your option) any later version.
00009  * 
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  * 
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00018  * 
00019  * Author:   Laird Breyer <laird@lbreyer.com>
00020  */
00021 
00022 #include "graphbuilder.h"
00023 
00024 #define Mb 1048576L
00025 
00026 /**
00027  * This constructor allocates approximately (smem+jmem) Mb for the trie and
00028  * nmem Mb for the nodetable.
00029  */
00030 GraphBuilder :: GraphBuilder(int smem, int jmem, int nmem, int lmem, bool sl) {
00031 
00032   trie = new Trie((smem * Mb), (jmem * Mb)/sizeof(SimpleHashPair<char*>)); 
00033 
00034   // this allows us to reference a WebNode by its URL
00035   long numels = (nmem * Mb)/sizeof(SimpleHashPair<WebNodePtr>);
00036   nodetable = new SimpleWebNodePtrHashTable(numels);
00037 
00038   // this is the web link graph
00039   graph = new WebLinkGraph();
00040   graph_is_docked = true;
00041 
00042   // these are used in constructing the nodes
00043   urlfilter = new URLFilter(true); // true means remove suffixes
00044   linkset = new RawLinkSet;
00045 
00046   curdoc = NULL;
00047   doc_url[0] = 0;
00048   doc_alias[0] = 0;
00049   curdoc_baseurl.Clear();
00050 
00051   // some statistics
00052   stats.heap_used_webnodes = 0;
00053 
00054   stats.cumulative_tolinks = 0;
00055   stats.cumulative_fromlinks = 0;
00056   stats.cumulative_dangling = 0;
00057   stats.cumulative_leaflinks = 0;
00058 
00059   stats.nodetable_insertions = 0;
00060   stats.nodetable_alias_insertions = 0;
00061 
00062   stats.lowid = kint32max;
00063   stats.highid = 0;
00064   stats.lowdate = kuint16max;
00065   stats.highdate = 0;
00066 
00067   flags.show_links = sl;
00068   flags.leaftable_memory = lmem;
00069 
00070 }
00071 
00072 GraphBuilder :: ~GraphBuilder() {
00073   // don't delete graph!
00074   delete linkset;
00075   delete nodetable;
00076   delete trie;
00077   if( leaftable ) { delete leaftable; }
00078 }
00079 
00080 void GraphBuilder :: StatisticsMem(ostream& o) {
00081 
00082   trie->Statistics(o);
00083 
00084   o << "\nTotal number of insertions in nodetable: " << stats.nodetable_insertions << endl;
00085   o << "Total number of aliases in nodetable:    " << stats.nodetable_alias_insertions << endl;
00086   o << "Heap needed for nodetable (bytes):       " << ((stats.nodetable_insertions + stats.nodetable_alias_insertions) * sizeof(SimpleHashPair<WebNodePtr>))<< endl;
00087   o << "Heap reserved for nodetable (bytes):     " << (nodetable->Size() * sizeof(SimpleHashPair<WebNodePtr>)) << endl;
00088 
00089   o << "\nMinimum size of each webnode (bytes):    " << sizeof(WebNode) << endl;
00090   o << "Heap needed for all webnodes (bytes):    " << stats.heap_used_webnodes << endl;
00091   o << "Unallocated webnodes (bytes):            " << WebNode::FreeBlocks() << endl;
00092   o << "Unallocated tolinks (bytes):             " << MemPool<LinkStruct>::FreeBlocks1() + MemPool<LinkStruct>::FreeBlocks2() << endl;
00093 }
00094 
00095 void GraphBuilder :: StatisticsGraph(ostream& o) {
00096   o << "Total number of nodes in web graph:      " << graph->size() << endl;
00097   o << "Lowest ID number in web graph:           " << stats.lowid << endl;
00098   o << "Highest ID number in web graph:          " << stats.highid << endl;
00099   o << "Lowest date in web graph:                " << stats.lowdate << endl;
00100   o << "Highest date in web graph:               " << stats.highdate << endl;
00101 
00102   o << "\nAverage number of tolinks/node:          " << static_cast<double>(stats.cumulative_tolinks)/graph->size() << endl;
00103   o << "Average number of dangling tolinks/node: " << static_cast<double>(stats.cumulative_dangling)/graph->size() << endl;
00104   o << "Average number of fromlinks/node:        " << static_cast<double>(stats.cumulative_fromlinks)/graph->size() << endl;
00105   o << "Average number of leaflinks/node:        " << static_cast<double>(stats.cumulative_leaflinks)/graph->size() << endl;
00106 }
00107 
00108 /// Creates a new WebNode on the heap and assigns anothernode as a handle
00109 void GraphBuilder :: NodeInitialize(uint32 idno) {
00110 
00111   assert( !curdoc );
00112 
00113   curdoc = new WebNode(idno);
00114   doc_url[0] = 0;
00115   doc_alias[0] = 0;
00116   curdoc_baseurl.Clear();
00117 
00118   //another_merge = false;
00119 
00120   stats.lowid = min(stats.lowid, idno);
00121   stats.highid = max(stats.highid, idno);
00122 
00123 }
00124 
00125 /// Places the WebNode handled by anothernode into the graph and clears anothernode.
00126 /**
00127  * The WebNode is pushed at the front of the graph. Since node id's are given sequentially
00128  * in increasing order, the graph will contain nodes with decreasing id sequence. This ordering
00129  * should not be tampered with, as it is used by Talker().
00130  */
00131 void GraphBuilder :: NodeLaunch() {
00132 
00133    assert( curdoc );
00134 
00135    stats.lowdate = (curdoc->Date() > 0) ? 
00136      min(stats.lowdate, curdoc->Date()) : stats.lowdate;
00137    stats.highdate = max(stats.highdate, curdoc->Date());
00138 
00139    graph->push_front(curdoc);
00140    linkset->clear();
00141 
00142    curdoc = NULL;
00143    doc_url[0] = 0;
00144    doc_alias[0] = 0;
00145    curdoc_baseurl.Clear();
00146    //another_merge = false;
00147 }
00148 
00149 /// Sets anothernode's date.
00150 void GraphBuilder :: NodeSetDate(unsigned short adate) {
00151   curdoc->SetDate(adate);
00152 }
00153 
00154 /// Inserts the anchor links contained in linkset into anothernode's fromlinks array.
00155 /**
00156  * Note that at this stage, all the links are pointer differences into the trie.
00157  */
00158 void GraphBuilder :: NodeInsertLinks() {
00159   curdoc->InsertRawLinks(linkset);
00160 }
00161 
/// Copies the current document's URL into doc_url, and places it into the trie.
/**
 * The copy is necessary so that later calls to FormatURL() can use the stored
 * URL to complete anchor link URLs, whenever those are incomplete.
 *
 * @param docurl   raw URL of the current document
 * @param aliasurl optional second URL for the same document; may be NULL
 *
 * If the formatted URL is already present in the nodetable, the freshly
 * created curdoc is deleted and replaced by the existing WebNode, so that
 * further link insertions accumulate on the original node.
 *
 * NOTE(review): docurl__/aliasurl__ store the caller's pointers, not copies --
 * presumably the caller keeps these strings alive while NodeGetURL_() /
 * NodeGetAlias_() may be called; confirm against the callers.
 */
void GraphBuilder :: NodeSetURL(const char *docurl, const char *aliasurl) {

  docurl__ = docurl;
  aliasurl__ = aliasurl;

  // NodeInitialize() must have reset doc_url for this document
  assert( !strlen(doc_url) );

  // format the document's url for insertion in the trie
  // formatURL returns a string whose length is guaranteed less than STRINGBUF_LEN2
  ContentType contype;
  strcpy(doc_url, urlfilter->FormatURL(docurl, strlen(docurl), NULL, &contype));

  // insert the url both in trie and the nodetable
  // in case of duplicates, exchange the curdoc pointer with 
  // a pointer to the already existing WebNode
  {
    ptrdiff_t key = trie->InsertURL(urlfilter->CompressURL(urlfilter->DeindexURL(doc_url)));

    WebNodePtr w = nodetable->Find(key);

    if( !w ) {

      assert(curdoc);

      nodetable->Insert(key, curdoc);

      stats.nodetable_insertions++;
    } else {  // this node was already inserted
      cerr << "warning: aliased or duplicate node [" << doc_url << "]" << endl;
      // this is called at the beginning when the link pointers haven't
      // been allocated yet
      delete curdoc; // rid of current node
      curdoc = w; // replace with existing one with same URL
      //another_merge = true; // make sure new links are added to old ones
    }
  }

  // compute the current document's base URL address components
  // this is used when formatting anchor links.
  urlfilter->ParseURL(doc_url, curdoc_baseurl.scheme, curdoc_baseurl.netloc,
                      curdoc_baseurl.query, curdoc_baseurl.params, curdoc_baseurl.path);

  // this will make aliasurl point to curdoc also
  if( aliasurl ) {
    strcpy(doc_alias, urlfilter->FormatURL(aliasurl, strlen(aliasurl), NULL, &contype));

    ptrdiff_t key = trie->InsertURL(urlfilter->CompressURL(urlfilter->DeindexURL(doc_alias)));

    WebNodePtr w = nodetable->Find(key);
    if( !w ) {
      assert(curdoc);
      nodetable->Insert(key, curdoc);
      stats.nodetable_alias_insertions++;
    } else {
      // the alias already resolves; it should resolve to the same node
      if(w != curdoc) {
        cerr << "warning: alias points to a different node. Dataset inconsistent? [" << doc_alias << "]" << endl;
        // policy: we ignore the error
      };
    }
  }

  if( flags.show_links ) {
    cout << doc_url << endl;
  }
}
00233 
00234 const char * GraphBuilder :: NodeGetURL_() {
00235   return docurl__;
00236 }
00237 
00238 const char * GraphBuilder :: NodeGetAlias_() {
00239   return aliasurl__;
00240 }
00241 
00242 const char * GraphBuilder :: NodeGetURL() {
00243   return doc_url;
00244 }
00245 
00246 const char * GraphBuilder :: NodeGetAlias() {
00247   return doc_alias;
00248 }
00249 
00250 const uint32 GraphBuilder :: NodeGetID() {
00251   return curdoc->ID();
00252 }
00253 
00254 
00255 const uint16 GraphBuilder :: NodeGetDate() {
00256   return curdoc->Date();
00257 }
00258 
/// Here we walk through all the nodes and change the
/// links so that they no longer require the trie.
/**
 * After this call the graph stands on its own: tolinks hold WebNode pointers
 * instead of trie keys, and every node's fromlinks array is populated.
 * Ownership of the returned graph passes to the caller (the destructor does
 * not delete it). Idempotent: a second call just returns the graph.
 *
 * @return the undocked web link graph
 */
WebLinkGraph* GraphBuilder :: UndockWebGraph() {
  cerr << "info: now undocking webgraph" << endl;
  if( graph_is_docked ) {
    // zeroth pass. Normalize tolinks and calculate array sizes for fromlinks
    for(WebLinkGraph::iterator i = graph->begin(); i != graph->end(); i++) {
      assert(*i);
      // convert ptrdiff_t to webnode* for all valid links
      // also initializes the valid link count
      (*i)->NormalizeRawLinks(nodetable);
     }

    // first pass. Calculate sizes of fromlinks arrays
    // (must run after ALL nodes are normalized, hence a separate pass)
     for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
       assert(*i);
      for(int k = 0; k < (*i)->NumberOfValidToLinks(); k++) {
        assert((*i)->ValidToLink(k));
        ((*i)->ValidToLink(k))->IncrementNumberOfFromLinks();
      }
    }

    // second pass. Insert fromlinks and calculate statistics
    // (the counts from the first pass size the fromlinks arrays)
    for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
      assert(*i);
      for(int k = 0; k < (*i)->NumberOfValidToLinks(); k++) {
        assert((*i)->ValidToLink(k));
        ((*i)->ValidToLink(k))->AppendFromLink(*i);
      }
      stats.heap_used_webnodes += (*i)->RealSize();
      stats.cumulative_dangling += (*i)->NumberOfDanglingToLinks();
      stats.cumulative_fromlinks += (*i)->NumberOfValidFromLinks();
      stats.cumulative_leaflinks += (*i)->NumberOfLeafLinks();
      stats.cumulative_tolinks += (*i)->NumberOfValidToLinks();
    }

#ifndef NDEBUG
    // third pass. check that fromlinks work properly
    cerr << "debug: check that fromlinks work properly" << endl;
    for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
      for(int k = 0; k < (*i)->NumberOfValidFromLinks(); k++) {
        assert((*i)->ValidFromLink(k));
      }
    }
#endif
    graph_is_docked = false;
  }
  cerr << "info: webgraph is now undocked" << endl;
  return graph;
}
00309 
00310 void GraphBuilder :: UpdateLeafLinks() {
00311   assert(leaftable);
00312   stats.cumulative_dangling = 0;
00313   stats.cumulative_leaflinks = 0;
00314   for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
00315     assert(*i);
00316     (*i)->UpdateLeafLinks(leaftable);
00317     stats.cumulative_leaflinks += (*i)->NumberOfLeafLinks();
00318     stats.cumulative_dangling += (*i)->NumberOfDanglingToLinks();
00319   }
00320 
00321 #ifndef NDEBUG
00322     // third pass. check that leaflinks work properly
00323     cerr << "debug: check that leaflinks work properly" << endl;
00324     for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
00325       for(int k = 0; k < (*i)->NumberOfLeafLinks(); k++) {
00326         assert((*i)->ValidLeafLink(k));
00327         assert((*i)->ValidLeafLink(k)->OccupationCount() == 0);
00328       }
00329     }
00330 #endif
00331 
00332 }
00333 
00334 /// finds an existing WebNode from a URL. Returns NULL if not found.
00335 WebNode* GraphBuilder :: FindWebNode(const char *url) {
00336   ptrdiff_t key = trie->FindURL(urlfilter->CompressURL(urlfilter->DeindexURL(url)));
00337   if( key > -1 ) {
00338     WebNodePtr w = nodetable->Find(key);
00339     if( w ) {
00340       return w;
00341     }
00342   }
00343   return NULL;
00344 }
00345 
00346 /// finds an existing LeafNode key into the leaftable from a URL. Returns -1 if not found.
00347 const ptrdiff_t GraphBuilder :: FindLeafNodeKey(const char *url) {
00348   return trie->FindURL(urlfilter->CompressURL(urlfilter->DeindexURL(url)));
00349 }
00350 
00351 void GraphBuilder :: SetupLeafTable() {
00352   if( !leaftable ) {
00353     leaftable = new SimpleLeafNodePtrHashTable((flags.leaftable_memory * Mb)/sizeof(SimpleHashPair<LeafNodePtr>));
00354   }
00355   leaftable->Clear();
00356 }
00357 
00358 void GraphBuilder :: AddLeaf(const ptrdiff_t key, LeafNodePtr leaf) {
00359   assert(leaftable);
00360   if( !leaftable->Find(key) ) {
00361     leaftable->Insert(key, leaf);
00362   } else {
00363     cerr << "warning: duplicate leaf ignored" << endl;
00364   }
00365 
00366   stats.lowdate = (leaf->Date() > 0) ? 
00367     min(stats.lowdate, leaf->Date()) : stats.lowdate;
00368   stats.highdate = max(stats.highdate, leaf->Date());
00369 }
00370 
/// Inserts an anchor link URL into the trie and records its key in linkset.
/**
 * The key is a pointer difference into the trie; NodeInsertLinks() later
 * copies the accumulated linkset into the current node.
 */
void GraphBuilder :: TrieInsertLinkURL(const char *url) {
  ptrdiff_t key = trie->InsertURL(urlfilter->CompressURL(urlfilter->DeindexURL(url)));
  // NOTE(review): presumably asserts that the trie cell at this key carries
  // no payload yet -- confirm against the Trie implementation of `bigs`
  assert(*(trie->bigs + key) == 0);
  linkset->insert(key);
}
00376 

Generated on Wed May 29 11:37:14 2002 for MarkovPR by doxygen 1.2.15