Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members  

webnode.h

Go to the documentation of this file.
00001 /** @file webnode.h */
00002 /* 
00003  * Copyright (C) 2002 Laird Breyer
00004  *  
00005  * This program is free software; you can redistribute it and/or modify
00006  * it under the terms of the GNU General Public License as published by
00007  * the Free Software Foundation; either version 2 of the License, or
00008  * (at your option) any later version.
00009  * 
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  * 
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00018  * 
00019  * Author:   Laird Breyer <laird@lbreyer.com>
00020  */
00021 #ifndef _WEB_NODE_H_
00022 #define _WEB_NODE_H_
00023 #include "mempool.h"
00024 #include "simplehash.h"
00025 #include "leafnode.h"
00026 #include <stdexcept>
00027 #include <set>
00028 
/// Forward declaration of the class that encapsulates a web document;
/// the full definition of WebNode appears further down in this header.
class WebNode;
/// Pointer alias for WebNode. Per the original note, all WebNodes are
/// allocated on the heap and handled as WebNodePtrs.
typedef WebNode *WebNodePtr;
00033 
00034 /// A pointer to a web document/URL.
00035 /**
00036  * During the construction of the web graph,
00037  * document links are of type ptrdiff_t, but after
00038  * the WebLinkGraph is undocked, they are of type WebNodePtr
00039  */
00040 struct LinkStruct {
00041   union {
00042     ptrdiff_t pointer_diff;
00043     WebNodePtr webnode_ptr;
00044     LeafNodePtr leafnode_ptr;
00045   };
00046 };
00047 
00048 /// A memory management wrapper around LinkStruct.
00049 typedef MemPoolObject<LinkStruct> Link;
00050 
00051 /// A scratch structure used by samplers.
00052 /// Each sampler should cast it into something useful
00053 typedef uint32 ScratchStruct;
00054 
/// Number of bytes in one megabyte (2^20), as a long constant.
#define Mb 1048576L
/// Number of WebNode objects that fit in one megabyte (integer division).
/// NOTE(review): presumably the growth increment of the WebNode memory
/// pool -- confirm against mempool.h.
#define WEBNODE_MEMPOOL_DELTA (Mb / sizeof(WebNode))
/// Number of Link objects that fit in one megabyte (integer division).
#define LINK_MEMPOOL_DELTA (Mb / sizeof(Link))

/// Number of usable tag bits per node; WebNode::SetTag asserts that the
/// bit index is below this value (the tag field is a uint16).
#define TAG_NUMBER_OF_BITS 16
00060 
00061 /// Contains all the information about a web document.
00062 struct WebNodeStruct {
00063   // the ordering here is important due to alignment issues
00064   Link *tolinks;
00065   Link *fromlinks;
00066   uint32 id;
00067   uint32 occupation_count;
00068   uint16 date;
00069   uint16 num_tolinks;
00070   uint16 num_fromlinks;
00071   uint16 num_valid_tolinks;
00072   uint16 num_leaflinks;
00073   uint16 tag;
00074   ScratchStruct scratch;
00075 };
00076 
/// Used by GraphBuilder to store uniquely the anchor links
/// and then insert them into the WebNode (see WebNode::InsertRawLinks).
/// The links are held in their ptrdiff_t form; being a set, duplicates
/// collapse and iteration is in ascending order.
/// The container is spelled std::set so that this header does not depend
/// on a using-directive leaking in from a previously included header.
/// (A commented-out variant taking a CharPtrComparisonFunction comparator
/// used to trail this declaration.)
typedef std::set<ptrdiff_t> RawLinkSet;
00080 
00081 /// Encapsulates a web document.
00082 /**
00083  * Every web document read by the ripper is represented by
00084  * a WebNode. The construction of a WebNode is complicated,
00085  * and is done by GraphBuilder, which also links the nodes
00086  * into a WebLinkGraph.
00087  * All the data members are defined as a WebNodeStruct, WebNode
00088  * is really just a wrapper for WebNodeStruct to handle custom
00089  * memory management.
00090  * The class inherits memory management from MemoryPooled<T>.
00091  */
00092 class WebNode: public MemoryPooled<WebNodeStruct> {
00093  public:
00094 
00095   WebNode(uint32 idno);
00096   void InsertRawLinks(RawLinkSet *s);
00097   void NormalizeRawLinks(SimpleHashTable<WebNodePtr> *h);
00098   size_t RealSize();
00099   // inline functions
00100   int NumberOfValidToLinks() 
00101     { return data.num_valid_tolinks;}
00102   int NumberOfDanglingToLinks() 
00103     { return (data.num_tolinks - data.num_valid_tolinks - data.num_leaflinks);}
00104   int NumberOfValidFromLinks() 
00105     { return data.num_fromlinks;}
00106   int NumberOfLeafLinks() 
00107     { return data.num_leaflinks;}
00108 
00109   void IncrementNumberOfFromLinks() 
00110     { data.num_fromlinks++; }
00111 
00112   void AppendFromLink(WebNodePtr anothernode) throw (overflow_error);
00113 
00114   void UpdateLeafLinks(SimpleLeafNodePtrHashTable *leaftable);
00115 
00116   void SetDate(uint16 adate);
00117 
00118   WebNodePtr ValidToLink(int k) 
00119     {
00120       assert(k < data.num_valid_tolinks);
00121       assert(data.tolinks[k].data.webnode_ptr);
00122       return data.tolinks[k].data.webnode_ptr;
00123     }
00124   WebNodePtr ValidFromLink(int k) 
00125     {
00126       assert(data.fromlinks[k].data.webnode_ptr);
00127       return data.fromlinks[k].data.webnode_ptr;
00128     }
00129   LeafNodePtr ValidLeafLink(int k)
00130     {
00131       assert(k < data.num_leaflinks);
00132       assert(data.tolinks[data.num_valid_tolinks + k].data.leafnode_ptr);
00133       return data.tolinks[data.num_valid_tolinks + k].data.leafnode_ptr;
00134     }
00135   LeafNodePtr ValidLeafLinkDirectly(int k) ///< Same as ValidLeafLink but saves a +/- in -O3
00136     {
00137       assert(k >= data.num_valid_tolinks);
00138       assert(k < data.num_valid_tolinks + data.num_leaflinks);
00139       assert(data.tolinks[k].data.leafnode_ptr);
00140       return data.tolinks[k].data.leafnode_ptr;
00141     }
00142 
00143   uint32 ID() 
00144     { return data.id; }
00145   uint16 Date() 
00146     { return data.date; }
00147 
00148   void ClearTag() 
00149     { data.tag = 0; }
00150   void SetTag(int k) 
00151     {
00152       assert(k < TAG_NUMBER_OF_BITS);
00153       data.tag |= (1<<k); // sets the kth bit
00154     }
00155   bool Tagged(int k)
00156     {
00157       return ((data.tag & (1<<k)) != 0);
00158     }
00159 
00160   void ClearOccupationCount() 
00161     { 
00162       data.occupation_count = 0;
00163       data.scratch = 0;
00164     }
00165   uint32 OccupationCount() 
00166     { return data.occupation_count; }
00167   void IncrementOccupationCount() 
00168     { data.occupation_count++; }
00169   void IncrementOccupationCount(int c) 
00170     { data.occupation_count += c; }
00171 
00172   ScratchStruct Scratch() 
00173     { return data.scratch; }
00174   void SetScratch(ScratchStruct ascratch) 
00175     { data.scratch = ascratch; }
00176 
00177  private:
00178   static MemPool<LinkStruct> global_link_pool;
00179 };
00180 
00181 #endif

Generated on Wed May 29 11:37:16 2002 for MarkovPR by doxygen1.2.15