Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members  

trie.cc

Go to the documentation of this file.
00001 /** @file trie.cc 
00002  * This file contains the member function definitions
00003  * for the class Trie.
00004  */
00005 /* 
00006  * Copyright (C) 2002 Laird Breyer
00007  *  
00008  * This program is free software; you can redistribute it and/or modify
00009  * it under the terms of the GNU General Public License as published by
00010  * the Free Software Foundation; either version 2 of the License, or
00011  * (at your option) any later version.
00012  * 
00013  * This program is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  * GNU General Public License for more details.
00017  * 
00018  * You should have received a copy of the GNU General Public License
00019  * along with this program; if not, write to the Free Software
00020  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00021  * 
00022  * Author:   Laird Breyer <laird@lbreyer.com>
00023  */
00024 
00025 #include "trie.h"
00026 #include <cstring>
00027 #include <cassert>
00028 
00029 Trie :: Trie(unsigned long slen, long jels) {
00030   // Builds an empty trie: a zero-filled bigstring of slen bytes plus a jump table made from jels.
00031   stats_cumulative_string_size = 0;
00032   stats_jumptable_insertions = 0;
00033   stats_bigstring_insertions = 0;
00034 
00035   // the bigstring starts out zeroed, with nothing stored in it yet
00036   end_of_bigs = 0;
00037   slen_ = slen;
00038   bigs = new char[slen];
00039   memset(bigs, 0, slen);
00040 
00041   jumptable = new SimpleCharPtrHashTable(jels);
00042 }
00043 
00044 void Trie :: Statistics(ostream& o) {
00045   // Writes the bigstring figures, then the jumptable figures, one labelled value per line, to o.
00046   o << "Total number of URL strings inserted:    " << stats_bigstring_insertions << endl
00047     << "Total original size of URLs inserted:    " << stats_cumulative_string_size << endl
00048     << "Heap needed for bigstring (bytes):       " << end_of_bigs << endl
00049     << "Heap reserved for bigstring (bytes):     " << slen_ << endl;
00050   o << "\nTotal number of insertions in jumptable: " << stats_jumptable_insertions << endl
00051     << "Heap needed for jumptable (bytes):       " << (stats_jumptable_insertions * sizeof(SimpleHashPair<char*>)) << endl
00052     << "Heap reserved for jumptable (bytes):     " << (jumptable->Size() * sizeof(SimpleHashPair<char*>)) << endl;
00053 }
00054 
00055 
00056 /// Searches for a URL within the trie. 
00057 /**
00058  * If found, returns the offset, from the start of the bigstring, of the
00059  * null that terminates the URL; else (or if url is empty) returns -1.
00060  */
00061 // code lifted from InsertURL, someday this will have to be refactored
00062 ptrdiff_t Trie :: FindURL(const char *url) {
00063   char *p = bigs;        // cursor into the stored text
00064   const char *q = url;   // cursor into the URL being looked up
00065 
00066   //  url = CompressURL(url);
00067 
00068   while( *q ) {
00069     // skip over common prefix
00070     while( (*q != 0) && ( *p == *q ) ) {p++; q++; };
00071 
00072     // now q is exhausted, or *p differs from *q (p may be on a terminating null)
00073 
00074     if( (*q == 0) && (*p == 0) ) { 
00075       // both ended together: exact match, p sits on the terminating null
00076       return p - bigs;
00077 
00078     } else if( *q == 0 ) { // a longer p string was inserted first
00079       // follow as many branches as possible
00080 
00081       char *nextp = jumptable->Find(p - bigs);
00082 
00083       while( nextp && (*nextp != 0) ) {
00084         p = nextp;
00085 
00086         nextp = jumptable->Find(p - bigs);
00087 
00088       }
00089 
00090       if( nextp && (*nextp  == 0) ) { // found the string
00091         // nextp is the terminator InsertURL reports for this URL, so return it rather than p
00092         return nextp - bigs;
00093 
00094       } else { // there are no further branches at p, 
00095 
00096         return -1; // string not found
00097 
00098       } 
00099     } else { // 0 != *q != *p
00100 
00101       char *nextp = jumptable->Find(p - bigs);
00102 
00103       if( nextp ) { // branch exists
00104         p = nextp;
00105         // and back to while
00106       } else { // no branch at p
00107 
00108         return -1; // string not found
00109 
00110       }
00111     }
00112   }
00113   
00114   return -1; // only reached when url is empty: there is nothing to look up
00115 }
00116 
00117 // char & ~SLASHBIT unsets branch bit
00118 // char | SLASHBIT sets branch bit 
00119 // char & SLASHBIT is true only if branch bit is set
00120 
00121 /// Inserts a URL string into the trie. 
00122 /**
00123  * Always returns the offset, from the start of the bigstring, of the
00124  * null terminating the inserted string; a URL that is already stored
00125  * is not duplicated and the offset of its existing terminator is returned.
00126  * Throws overflow_error when the bigstring cannot be guaranteed to hold
00127  * the URL plus its terminator. The URL must not be empty once the trie
00128  * holds data (the closing asserts would fire).
00129  * Note InsertURL assumes url is a pure ASCII string (7 bits).
00130  */
00131 ptrdiff_t Trie :: InsertURL(const char *url) throw (overflow_error) {
00132   char *p = bigs;
00133   const char *q = url;
00134   char *end_of_inserted_string = NULL;
00135 
00136   stats_cumulative_string_size += (strlen(url)+1);
00137 
00138   // DEBUG cout << (node ? "========== " : "- - - - - -") << url << endl;
00139   // worst case appends strlen(url) chars plus a null, so the last byte written is end_of_bigs + strlen(url)
00140   if( end_of_bigs + strlen(url) >= slen_ ) {
00141     Statistics(cout);
00142     cerr << "d'oh: out of string memory" << endl;
00143     throw overflow_error("");
00144   } else if ( !end_of_bigs ) { // first one is special
00145     // just append string
00146     // DEBUG cout << "appending " << q << endl;
00147     while( *q ) { bigs[end_of_bigs++] = *q++; }
00148     bigs[end_of_bigs] = 0;
00149 
00150     end_of_inserted_string = bigs + end_of_bigs; 
00151 
00152     end_of_bigs++;
00153     stats_bigstring_insertions++;
00154 
00155   } else {
00156 
00157     while( *q ) {
00158       // skip over common prefix
00159       // try      while( (*q != 0) && ( (*p & ~NODEBIT) == *q ) ) {p++; q++; };
00160       while( (*q != 0) && ( *p == *q ) ) {p++; q++; };
00161 
00162       // DEBUG cout << "skipped: q=[" << q << "] vs p=[" << p << "]" << endl;
00163 
00164       // now q is exhausted, or *p differs from *q (p may be on a terminating null)
00165 
00166       if( (*q == 0) && (*p == 0) ) { 
00167 
00168         // we found the string
00169         // DEBUG cout << "found" << endl;
00170 
00171         end_of_inserted_string = p; 
00172 
00173       } else if( *q == 0 ) { // a longer p string was inserted first
00174         // follow as many branches as possible
00175         // DEBUG cout << "checking for a branch at p=[" << p << endl;
00176 
00177         char *nextp = jumptable->Find(p - bigs);
00178 
00179         while( nextp && (*nextp != 0) ) {
00180           p = nextp;
00181 
00182           nextp = jumptable->Find(p - bigs);
00183 
00184           // DEBUG cout << "branching p=[" << p << "]->[" << nextp << "]" << endl;
00185         }
00186 
00187         if( nextp && (*nextp  == 0) ) { // found the string
00188           // DEBUG cout << "found" << endl;
00189 
00190           p = nextp; 
00191           end_of_inserted_string = nextp; 
00192 
00193         } else { // there are no further branches at p, append q's terminating null 
00194           assert( !nextp );
00195           // DEBUG cout << "appending [" << q << "]" << endl;
00196           // record a branch from p to a fresh, lone terminator at the end of the bigstring
00197           jumptable->Insert(p - bigs, bigs + end_of_bigs);
00198           
00199           stats_jumptable_insertions++;
00200           bigs[end_of_bigs] = 0;
00201 
00202           end_of_inserted_string = bigs + end_of_bigs; 
00203 
00204           end_of_bigs++;
00205           stats_bigstring_insertions++;
00206         }       
00207       } else { // 0 != *q != *p
00208 
00209         char *nextp = jumptable->Find(p - bigs);
00210 
00211         // DEBUG cout << "branching p=[" << p << "]->[" << nextp << "]" << endl;
00212         if( nextp ) { // branch exists
00213           p = nextp;
00214           // and back to while
00215         } else { // no branch at p, so we append q
00216           // DEBUG cout << "appending [" << q << "]" << endl;
00217           // record a branch from p to the remainder of q, which is copied to the end of the bigstring
00218           jumptable->Insert(p - bigs, bigs + end_of_bigs);
00219 
00220           stats_jumptable_insertions++;
00221           while( *q ) { bigs[end_of_bigs++] = *q++; }
00222           bigs[end_of_bigs] = 0;
00223 
00224           end_of_inserted_string = bigs + end_of_bigs; 
00225 
00226           end_of_bigs++;
00227           stats_bigstring_insertions++;
00228           // *q == 0 now
00229         }
00230       }
00231     }
00232 
00233   }
00234   // every path above must have located or written the terminating null
00235   assert(end_of_inserted_string);
00236   assert(*end_of_inserted_string == 0);
00237 
00238   return end_of_inserted_string - bigs;
00239 
00240 }
00241 
00242 
00243 
00244 #ifdef UNIT_TEST
00245 // MAKE_TEST_STATEMENT(x): print the source text of x, then execute x once.
00246 #define MAKE_TEST_STATEMENT(x) \
00247 do { \
00248   cout << "testing: " #x << endl; \
00249   cout << "output: " << endl; \
00250   (x); \
00251 } while(false)
00252 // MAKE_TEST(x,y): print PASSED when x == y, else FAILED; x is evaluated twice (compare, then print); needs argv in scope.
00253 #define MAKE_TEST(x,y) \
00254 do { \
00255    cout << argv[0] << ": " << (((x) == (y)) ? "PASSED" : "FAILED") << " " #x <<endl; \
00256    cout << "output: " << (x) << endl; \
00257 } while(false)
00258 // MAKE_TEST_EX(x,y): print PASSED only when evaluating x throws something caught as type y; needs argv in scope.
00259 #define MAKE_TEST_EX(x,y) \
00260 do { \
00261    try { \
00262      (x); \
00263      cout << argv[0] << ": FAILED" << " " #x << endl; \
00264    } catch (y) { \
00265      cout << argv[0] << ": PASSED" << " " #x << endl; \
00266    } \
00267 } while(false)
00268 // Unit-test driver, compiled only when UNIT_TEST is defined; prints one PASSED/FAILED line per check.
00269 int main(int argc, char** argv) {
00270   Trie *trie = new Trie(100, 5);
00271   // MAKE_TEST evaluates its expression twice, so each URL below is inserted and then found again
00272   MAKE_TEST(trie->InsertURL("banana"),6);
00273   MAKE_TEST(trie->InsertURL("apple"),12);
00274   MAKE_TEST(trie->InsertURL("bananarama"),17);
00275   // a stored URL yields the offset InsertURL returned for it; an unknown URL yields -1
00276   MAKE_TEST(trie->FindURL("apple"),12);
00277   MAKE_TEST(trie->FindURL("coconut"),-1);
00278   // expected counters reflect the doubled InsertURL calls: 2*(7+6+11) = 48 bytes, 3 appends, 2 branches
00279   MAKE_TEST(trie->StatsCumulativeStringSize(), 48);
00280   MAKE_TEST(trie->StatsBigstringInsertions(), 3);
00281   MAKE_TEST(trie->StatsJumptableInsertions(), 2);
00282   // a 104-character URL cannot fit in the 100-byte bigstring, so InsertURL must throw
00283   MAKE_TEST_EX(trie->InsertURL("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"),overflow_error);
00284 }
00285 #endif

Generated on Wed May 29 11:37:16 2002 for MarkovPR by doxygen1.2.15