Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members  

graphbuilder.h

Go to the documentation of this file.
00001 /** @file graphbuilder.h */
00002 /* 
00003  * Copyright (C) 2002 Laird Breyer
00004  *  
00005  * This program is free software; you can redistribute it and/or modify
00006  * it under the terms of the GNU General Public License as published by
00007  * the Free Software Foundation; either version 2 of the License, or
00008  * (at your option) any later version.
00009  * 
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  * 
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00018  * 
00019  * Author:   Laird Breyer <laird@lbreyer.com>
00020  */
00021 
00022 #ifndef _GRAPH_BUILDER_H_
00023 #define _GRAPH_BUILDER_H_
00024 #include "document.h"
00025 #include "webnode.h"
00026 #include "leafnode.h"
00027 #include "leafnode.h"
00028 #include "simplehash.h"
00029 #include "linkgraph.h"
00030 #include "trie.h"
00031 #include "urlfilter.h"
00032 
00033 #define SLASHBIT 0x80  // single-bit mask: bit 7 (0x80 == 128). NOTE(review): where this mask is applied is not visible in this header -- confirm in the implementation.
00034 
00035 /// Hash table of WebNodePtr values, used to look up WebNode objects by their URLs in the trie.
00036 typedef SimpleHashTable<WebNodePtr> SimpleWebNodePtrHashTable;
00037 
00038 /// Builds the web link graph as an object of type WebLinkGraph.
00039 /**
00040  * Every WebNode is a separate document, and GraphBuilder handles
00041  * the connection of fromlinks and tolinks, and the discarding of
00042  * dangling links. 
00043  *
00044  * To do this, GraphBuilder must construct a trie containing all 
00045  * known document URLs. Once the WebLinkGraph is built, the trie
00046  * is "undocked" from the list. This allows the (substantial) memory
00047  * taken by the URL strings to be regained, at the cost of no longer 
00048  * being able to identify a WebNode by its document URL.
00049  */
00050 class GraphBuilder {
00051  public:
00052  // NOTE(review): argument meanings are not visible in this header; the *mem names suggest memory/size settings -- confirm in the implementation.
00053   GraphBuilder(int smem, int jmem, int nmem, int lmem, bool sl);
00054   ~GraphBuilder();
00055  // Node* methods take no node argument. NOTE(review): presumably they operate on the private curdoc member -- confirm in the implementation.
00056   void NodeInitialize(uint32 idno);
00057   void NodeSetURL(const char *docurl, const char *aliasurl);
00058   const char* NodeGetURL();
00059   const char* NodeGetAlias();
00060   const char* NodeGetURL_();    // NOTE(review): how the trailing-underscore variants differ from the plain getters is not visible here
00061   const char* NodeGetAlias_();
00062   const uint32 NodeGetID();     // the top-level const on a by-value return has no effect for callers
00063   const uint16 NodeGetDate();   // same remark: const on a by-value return is inert
00064   void NodeSetDate(unsigned short aDate);
00065   void NodeInsertLinks();
00066   void NodeLaunch();
00067   URLComponents* NodeGetURLParts() // returns the address of the internal curdoc_baseurl member, not a copy
00068     { return &curdoc_baseurl; }
00069   
00070   WebLinkGraph* UndockWebGraph(); // NOTE(review): ownership of the returned graph is not stated in this header (see graph_is_docked) -- confirm
00071  // Forwards to urlfilter->FormatURL, supplying curdoc_baseurl as the extra URLComponents argument.
00072   const char* FormatURL(const char *anurl, int anurl_len, ContentType *t) 
00073     { return urlfilter->FormatURL(anurl, anurl_len, &curdoc_baseurl, t); }
00074 
00075   void TrieInsertLinkURL(const char *url);
00076 
00077   WebNodePtr FindWebNode(const char *url);
00078  // Leaf-node interface. NOTE(review): behavior is defined out of line; presumably backed by the private leaftable member -- confirm.
00079   void SetupLeafTable();
00080   void AddLeaf(ptrdiff_t key, LeafNodePtr leaf);
00081   const ptrdiff_t FindLeafNodeKey(const char *url); // const on a by-value return is inert
00082   void UpdateLeafLinks();
00083  // Statistics output; both take the destination stream by reference.
00084   void StatisticsMem(ostream& o);
00085   void StatisticsGraph(ostream& o);
00086  // Read-only accessors for the ID range recorded in the private stats struct.
00087   uint32 LowestID()
00088     { return stats.lowid; }
00089   uint32 HighestID()
00090     { return stats.highid; }
00091  // Read-only accessors for the date range recorded in the private stats struct.
00092   uint16 LowestDate()
00093     { return stats.lowdate; }
00094   uint16 HighestDate()
00095     { return stats.highdate; }
00096  // Public, directly accessible option fields (anonymous struct type).
00097   struct {
00098     bool show_links;
00099     int leaftable_memory;
00100   } flags;
00101 
00102  private:
00103 
00104   WebNode* curdoc;
00105 
00106   URLComponents curdoc_baseurl; // exposed via NodeGetURLParts() and passed to urlfilter->FormatURL() by FormatURL()
00107   char doc_url[STRINGBUF_LEN2+1];   // fixed-size buffer; STRINGBUF_LEN2 is defined outside this header
00108   char doc_alias[STRINGBUF_LEN2+1]; // fixed-size buffer of the same capacity as doc_url
00109 
00110   const char *docurl__;
00111   const char *aliasurl__;
00112   //bool another_merge; // for when we insert duplicate nodes 
00113   // store its decomposition for easy access
00114 
00115   WebLinkGraph *graph;
00116   bool graph_is_docked;
00117 
00118   Trie *trie;
00119   SimpleWebNodePtrHashTable *nodetable;
00120   SimpleLeafNodePtrHashTable *leaftable;
00121   URLFilter *urlfilter; // used by the inline FormatURL() above
00122 
00123   RawLinkSet *linkset; 
00124  // Bookkeeping fields; lowid/highid/lowdate/highdate are returned by LowestID()/HighestID()/LowestDate()/HighestDate().
00125   struct {
00126     uint32 heap_used_webnodes;
00127     uint32 cumulative_tolinks;
00128     uint32 cumulative_fromlinks;
00129     uint32 cumulative_leaflinks;
00130     uint32 cumulative_dangling;
00131     uint32 nodetable_insertions;
00132     uint32 nodetable_alias_insertions;
00133     uint32 lowid;
00134     uint32 highid;
00135     uint16 lowdate;
00136     uint16 highdate;
00137   } stats;
00138 
00139 };
00140 
00141 #endif

Generated on Wed May 29 11:37:14 2002 for MarkovPR by doxygen1.2.15