00001 /** @file trie.h */ 00002 /* 00003 * Copyright (C) 2002 Laird Breyer 00004 * 00005 * This program is free software; you can redistribute it and/or modify 00006 * it under the terms of the GNU General Public License as published by 00007 * the Free Software Foundation; either version 2 of the License, or 00008 * (at your option) any later version. 00009 * 00010 * This program is distributed in the hope that it will be useful, 00011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00013 * GNU General Public License for more details. 00014 * 00015 * You should have received a copy of the GNU General Public License 00016 * along with this program; if not, write to the Free Software 00017 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 00018 * 00019 * Author: Laird Breyer <laird@lbreyer.com> 00020 */ 00021 00022 #ifndef _TRIE_H_ 00023 #define _TRIE_H_ 00024 #include <iostream> 00025 #include "simplehash.h" 00026 #include <stdexcept> 00027 00028 #define STRINGBUF_LEN0 511 00029 #define STRINGBUF_LEN1 1023 00030 #define STRINGBUF_LEN2 2047 00031 #define STRINGBUF_LEN3 4095 00032 00033 /// Hashtable used to navigate trie. 00034 typedef SimpleHashTable<char*> SimpleCharPtrHashTable; 00035 00036 /// Stores URL strings by superposition of common prefixes 00037 /** 00038 * This class implements a classic trie structure (Knuth, Vol. 3) 00039 * which consists of a very long string space together with a hashtable 00040 * (jumptable) which allows navigation. 00041 * 00042 * The trie allows large space savings by storing common string prefixes only 00043 * once. 00044 */ 00045 class Trie { 00046 public: 00047 Trie(unsigned long slen, long jlen); 00048 ptrdiff_t FindURL(const char *url); 00049 ptrdiff_t InsertURL(const char *url) throw (overflow_error); 00050 00051 void Statistics(ostream& o); 00052 00053 uint32 StatsCumulativeStringSize() 00054 { return stats_cumulative_string_size; } 00055 uint32 StatsBigstringInsertions() 00056 { return stats_bigstring_insertions; } 00057 uint32 StatsJumptableInsertions() 00058 { return stats_jumptable_insertions; } 00059 00060 char* bigs; 00061 00062 protected: 00063 00064 unsigned long slen_; 00065 unsigned long end_of_bigs; 00066 00067 SimpleCharPtrHashTable *jumptable; 00068 00069 uint32 stats_cumulative_string_size; 00070 uint32 stats_bigstring_insertions; 00071 uint32 stats_jumptable_insertions; 00072 00073 }; 00074 #endif