
urlfilter.cc

/** @file urlfilter.cc */
/* 
 * Copyright (C) 2002 Laird Breyer
 *  
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 * 
 * Author:   Laird Breyer <laird@lbreyer.com>
 */

#include "urlfilter.h"
#include <cassert>
#include <ctype.h>
#include <cstring>   // strcpy, strstr, strcat, strchr, strncpy (may already come in via urlfilter.h)
#include <strings.h> // strncasecmp
#include <stdexcept> // domain_error
#include <iostream>

URLFilter :: URLFilter(bool removesuf) {
  flags.remove_html_suffix = removesuf;
}

/**
 * This function takes a standardized URL (see NormalizeURLPath())
 * and removes the trailing string /index.htm(l).
 * This is used to compact the string before adding it to the Trie
 * (in a trie, common prefixes are shared cheaply, but common
 * suffixes waste space). If the remove_html_suffix flag is set,
 * other common HTML endings are also tokenized to reduce space
 * requirements. An example mapping is given after the function body.
 *
 * Note that this operation is irreversible (we cannot
 * reinsert the suffix /index.html reliably in all cases).
 * The string anurl is not modified.
 */
const char* URLFilter :: DeindexURL(const char *anurl) {
  // make a copy of the string
  assert(strlen(anurl) < STRINGBUF_LEN2);
  strcpy(deindex_scratchbuf, anurl);

  char *p = strstr(deindex_scratchbuf, "/index.htm");
  if( p && ((p[10] == 0) || (p[11] == 0)) ) {
    *p = 0;
  } else if( flags.remove_html_suffix ) { 
    p = strstr(deindex_scratchbuf, ".htm");
    if( p ) {
      if(p[4] == 0) { // ends in .htm
        p[0] = 31; // nonprintable marker
        p[1] = 0;
      } else if(p[5] == 0) { // ends in .html
        p[0] = 30;
        p[1] = 0;
      }
    } else { 
      p = strstr(deindex_scratchbuf, ".shtm");
      if( p ) {
        if(p[5] == 0) { // ends in .shtm
          p[0] = 29;
          p[1] = 0;
        } else if(p[6] == 0) { // ends in .shtml
          p[0] = 28;
          p[1] = 0;
        }
      } else { 
        p = strstr(deindex_scratchbuf, ".asp");
        if( p && (p[4] == 0) ) { // ends in .asp
          p[0] = 27;
          p[1] = 0;
        } else { // also consider .php when .asp is not a suffix
          p = strstr(deindex_scratchbuf, ".php");
          if( p && (p[4] == 0) ) { // ends in .php
            p[0] = 26;
            p[1] = 0;
          }
        }
      }
    }
  }
  return deindex_scratchbuf;
}

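/*
 * Example (illustrative, not part of the original code): with
 * remove_html_suffix enabled, DeindexURL() maps for instance
 *
 *   "http://host/dir/index.html"  ->  "http://host/dir"
 *   "http://host/dir/page.html"   ->  "http://host/dir/page\x1e"  (0x1E marks ".html")
 *   "http://host/dir/page.htm"    ->  "http://host/dir/page\x1f"  (0x1F marks ".htm")
 *
 * so that many URLs share a long common prefix and carry only a
 * one-byte marker where the suffix used to be.
 */
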
/**
 * This function compresses a URL, whose characters are 
 * guaranteed to fit within seven bits, by removing
 * the forward slashes, which are the most commonly used
 * character. Every time a slash is removed, the *preceding*
 * character has its eighth bit set. A slash is not removed
 * if the previous character already has its eighth bit set.
 *
 * The compressed URL is always located in the special buffer
 * comp_scratchbuf[]. The string anurl is not modified.
 */ 
const char* URLFilter :: CompressURL(const char *anurl) {
  const char *p = anurl; 
  char *q = comp_scratchbuf;
  while( *p ) {
    if( p[1] == '/' ) {
      *q++ = (*p | SLASHBIT); // print *p with slashbit
      p++;
    } else {
      *q++ = *p; // just print *p
    }
    p++;
  }
  *q = 0;
  return comp_scratchbuf; // the compressed copy; anurl itself is untouched
}

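/**
 * Illustrative sketch, not part of the original class: the inverse of
 * CompressURL(). It assumes SLASHBIT is the high-bit mask used above and
 * that outbuf can hold up to twice the length of the compressed string.
 * Kept as a free helper so the original class interface is unchanged.
 */
static const char* DecompressURLExample(const char *compressed, char *outbuf) {
  char *q = outbuf;
  for( const char *p = compressed; *p; p++ ) {
    *q++ = (char)(*p & ~SLASHBIT); // recover the original seven-bit character
    if( *p & SLASHBIT ) {
      *q++ = '/';                  // re-insert the slash that was folded away
    }
  }
  *q = 0;
  return outbuf;
}
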
/// Reverses str in place; the terminating NUL slot is used as scratch space.
char *stringreverse(char *str)
{
  register int max,high,low;

  max=strlen(str);
  high=max-1;
  low=0;

  while (low < high)
    {
    str[max] = str[low];
    str[low++] = str[high];
    str[high--] = str[max];
    }

  str[max] = 0;
  return str;
}

/// Decomposes a URL into its components for analysis.
/**
 * Each of the supplied buffers must be STRINGBUF_LEN1 long.
 * This function does not modify anurl.
 *
 * If flags.rearrange_components is true, the network location
 * and file path are rearranged so that the suffix is placed first.
 */
void URLFilter :: ParseURL(const char *anurl, char *schemebuf, 
                              char *netlocbuf, char *querybuf,
                              char *paramsbuf, char *pathbuf) {
  
  assert( strlen(anurl) <= STRINGBUF_LEN1 );

  strcpy(parse_scratchbuf, anurl);
  
  char *q;

  // get rid of the fragment if it's there
  char *p = strchr(parse_scratchbuf, '#');
  if( p ) { *p = 0; }

  // now get scheme
  p = strchr(parse_scratchbuf, ':');
  if( p ) {
    int l = p - parse_scratchbuf;
    strncpy(schemebuf, parse_scratchbuf, l);
    schemebuf[l] = 0;
    for(int k = 0; k < l; k++) {
      if( !isalnum(schemebuf[k]) && !(schemebuf[k] == '+') &&
          !(schemebuf[k] == '.') && !(schemebuf[k] == '-') ) { // illegal
        schemebuf[0] = 0;
      }
      schemebuf[k] = tolower(schemebuf[k]);
    }
    q = p + 1; //strcpy(parse_scratchbuf2, p + 1); // rid of scheme
  } else { 
    schemebuf[0] = 0;
    q = parse_scratchbuf; //strcpy(parse_scratchbuf2, parse_scratchbuf1); // rid of scheme
  }

  // now get network location
  if( (q[0] == '/') && (q[1] == '/') ) { // double slash
    p = strchr(q + 2, '/');
    if( p ) {
      int l = p - q - 2;
      strncpy(netlocbuf, q + 2, l);
      netlocbuf[l] = 0;
      q = p;
      // strcpy(parse_scratchbuf3, p);
    } else {
      strcpy(netlocbuf, q + 2);
      q = parse_scratchbuf + strlen(parse_scratchbuf);
      assert( *q == 0 );
    }
  } else { // no double slash found
    netlocbuf[0] = 0;
    // q is still good
  }
  // we convert netloc to lowercase - 
  // could be problematic if login/passwords are used though
  for(char* cp = netlocbuf; *cp; cp++) { *cp = tolower(*cp); }

  // remove query string if present
  p = strchr(q, '?');
  if( p ) {
    strcpy(querybuf, p);
    *p = 0;
  } else {
    querybuf[0] = 0;
  }

  // remove parameters if present
  p = strchr(q, ';');
  if( p ) {
    strcpy(paramsbuf, p);
    *p = 0;
  } else {
    paramsbuf[0] = 0;
  }

  // what's left is the path
  strcpy(pathbuf, q);
}


/// Fixes the path when the document is an index.html-style default page.
/**
 * This function maps paths of the form
 *
 *   xxx/
 *   xxx/index.htm
 *   xxx/index.html
 *
 * to the standard form xxx/index.html.
 *
 * WARNING: This function modifies the string apath. It is 
 * assumed that apath has STRINGBUF_LEN1 storage available.
 */
void URLFilter :: NormalizeURLPath(char *apath) {
  int j = strlen(apath) - 1;
  if( j > 0 ) {
    if( apath[j] == '/' ) {
      if( j < (STRINGBUF_LEN1 - 11) )
        strcat(apath, "index.html");
    } else if( apath[j] == 'm' ) { 
      // is it index.htm? then append 'l'
      if( (j >= 8) && 
          (strncasecmp(apath + j - 8, "index.ht", 8) == 0) &&
          (j < (STRINGBUF_LEN1 - 2)) ) {
        apath[++j] = 'l';
        apath[++j] = 0;
      }
    }
  } else { // path is empty (or just "/")
    strcpy(apath, "/index.html");
  }
}

/// classifies a file according to its extension
/// (archives and audio/video formats are lumped into coarse categories)
ContentType URLFilter :: ClassifyURLPath(const char *path) {
  ContentType ctype;
  int pathlen = strlen(path);

  if( (pathlen > 5) && (strncasecmp(path + pathlen - 5, ".html", 5) == 0) ) {
    ctype = CONTENT_TEXT_HTML;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".htm", 4) == 0) ) {
    ctype = CONTENT_TEXT_HTML;
  } else if( (pathlen > 6) && (strncasecmp(path + pathlen - 6, ".shtml", 6) == 0) ) {
    ctype = CONTENT_TEXT_HTML;
  } else if( (pathlen > 7) && (strncasecmp(path + pathlen - 7, ".readme", 7) == 0) ) {
    ctype = CONTENT_TEXT_PLAIN;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".asp", 4) == 0) ) {
    ctype = CONTENT_TEXT_HTML;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".lha", 4) == 0) ) {
    ctype = CONTENT_APPLICATION_XGZIP;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".txt", 4) == 0) ) {
    ctype = CONTENT_TEXT_PLAIN;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".rtf", 4) == 0) ) {
    ctype = CONTENT_TEXT_RTF;
  } else if( (pathlen > 3) && (strncasecmp(path + pathlen - 3, ".ps", 3) == 0) ) {
    ctype = CONTENT_APPLICATION_POSTSCRIPT;
  } else if( (pathlen > 3) && (strncasecmp(path + pathlen - 3, ".gz", 3) == 0) ) {
    ctype = CONTENT_APPLICATION_XGZIP;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".doc", 4) == 0) ) {
    ctype = CONTENT_APPLICATION_MSWORD;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".ppt", 4) == 0) ) {
    ctype = CONTENT_APPLICATION_MS_POWERPOINT;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".jpg", 4) == 0) ) {
    ctype = CONTENT_IMAGE;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".zip", 4) == 0) ) {
    ctype = CONTENT_APPLICATION_XGZIP;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".gif", 4) == 0) ) {
    ctype = CONTENT_IMAGE;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".png", 4) == 0) ) {
    ctype = CONTENT_IMAGE;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".pdf", 4) == 0) ) {
    ctype = CONTENT_APPLICATION_PDF;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".php", 4) == 0) ) {
    ctype = CONTENT_TEXT_HTML;
  } else if( (pathlen > 5) && (strncasecmp(path + pathlen - 5, ".jpeg", 5) == 0) ) {
    ctype = CONTENT_IMAGE;
  } else if( (pathlen > 5) && (strncasecmp(path + pathlen - 5, ".shtm", 5) == 0) ) {
    ctype = CONTENT_TEXT_HTML;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".ram", 4) == 0) ) {
    ctype = CONTENT_AUDIO_MP3;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".mov", 4) == 0) ) {
    ctype = CONTENT_AUDIO_MP3;
  } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".mpg", 4) == 0) ) {
    ctype = CONTENT_AUDIO_MP3;
  } else {
    // otherwise, we don't recognize the type
    ctype = CONTENT_GOOGLE_OTHER;
  }
  return ctype;
}

// Note: this function completes relative URLs against the current
// document's URL, which is only possible if baseurl is defined.
/**
 * This function formats anurl into a standard form. Its most important
 * use is as a completion mechanism for URL fragments as can be found
 * in anchor tags. The URL is completed relative to the baseurl, which
 * typically is the current document's URL.
 *
 * The return value will always be a pointer to one of the scratch buffers,
 * so you should copy the returned string before formatting another.
 */
const char * URLFilter :: FormatURL(const char *anurl, int anurl_len, 
                                    URLComponents *baseurl, ContentType *foundtype) throw (domain_error) {
  assert(anurl_len > 0);

  // overflow possibility:
  // if anurl_len is greater than STRINGBUF_LEN1, then the url is likely machine
  // generated and contains some crypto key - we keep only the first
  // STRINGBUF_LEN1 chars, since the resulting string is most likely unique already
  if( anurl_len >= STRINGBUF_LEN1 ) {
    cerr << "warning: truncating url" << endl;
    anurl_len = STRINGBUF_LEN1; 
  }

  char *fullhref = NULL;

  // copy and make null terminated
  // also make sure that the string is composed of valid characters
  // from RFC 2396, we shall accept all ASCII chars excluding only
  // 0x00-0x1F, 0x7F, which are unprintable. This allows some non-URI
  // characters through, but those are harmless. 
  int a;
  for(a = 0; a < anurl_len; a++) {
    if( (anurl[a] <= 0x1F) || (anurl[a] >= 0x7F) ) {
      break;
    } else {
      scratchbuf1[a] = anurl[a];
    }
  }
  scratchbuf1[a] = 0;
  
  // this replaces the existing contents of all scratchbufs
  ParseURL(scratchbuf1, scratchbuf0, scratchbuf3, scratchbuf5, scratchbuf6, scratchbuf2);
  // if path ends in / we append index.html
  NormalizeURLPath(scratchbuf2);
  // set the content type
  *foundtype = ClassifyURLPath(scratchbuf2);

  if( strlen(scratchbuf3) > 0 ) { // if network location defined

    if( scratchbuf0[0] ) {
      strcpy(scratchbuf1, scratchbuf0);
      strcat(scratchbuf1, "://");
    } else { // insert default protocol scheme
      strcpy(scratchbuf1, "http://");
    }
    strcat(scratchbuf1, scratchbuf3);
    strcat(scratchbuf1, scratchbuf2);
    strcat(scratchbuf1, scratchbuf6);
    strcat(scratchbuf1, scratchbuf5);
    fullhref = scratchbuf1;

  } else if( baseurl ) { // it's relative to our document's network location

    if( scratchbuf0[0] ) {
      strcpy(scratchbuf4, scratchbuf0);
      strcat(scratchbuf4, "://");
    } else { // insert default protocol scheme
      strcpy(scratchbuf4, "http://");
    }
    strcat(scratchbuf4, baseurl->netloc);
    strcat(scratchbuf4, "/");

    // the path may be absolute or relative
    if( scratchbuf2[0] == '/' ) { // absolute

      strcat(scratchbuf4, scratchbuf2 + 1);
      strcat(scratchbuf4, scratchbuf6);
      strcat(scratchbuf4, scratchbuf5);
      fullhref = scratchbuf4;

    } else { // relative path

      // if it's a script or an html file we remove everything
      // until the last /

      int r = strlen(baseurl->path) - 1;
      if( strlen(baseurl->query) || strlen(baseurl->params) ||
          ((r > 5) && (strncasecmp(baseurl->path + r - 5, ".shtml",6) == 0)) ||
          ((r > 4) && (strncasecmp(baseurl->path + r - 4, ".html",5) == 0)) ||
          ((r > 3) && (strncasecmp(baseurl->path + r - 3, ".htm",4) == 0)) ) {
        
        char *s = strrchr(baseurl->path, '/');
        assert(s);
        char *t = scratchbuf4 + strlen(scratchbuf4);
        char *u = baseurl->path + 1;
        while( u <= s ) {
          *t++ = *u++;
        }
        *t = 0;
        // now ends with a slash

      } else { // neither script nor html, assume it's a bona fide directory

        if( baseurl->path[0] == '/' ) {
          strcat(scratchbuf4, baseurl->path + 1);
        } else {
          strcat(scratchbuf4, baseurl->path);
        }

        // append trailing slash if needed
        int l = strlen(scratchbuf4) - 1;
        if( (l >= 0) && (scratchbuf4[l] != '/') ) {
          scratchbuf4[++l] = '/';
          scratchbuf4[++l] = 0;
        } else if( l < 0) { 
          scratchbuf4[0] = '/';
          scratchbuf4[1] = 0;
        }
        // now ends with a slash
      }

      // now we can append the actual relative path 
      // note this doesn't start with /
      strcat(scratchbuf4, scratchbuf2);

      // now remove "../" patterns: each "<component>/../" is blanked out in
      // place with the marker byte 0x80 (e.g. ".../dir/../file" -> ".../file");
      // the markers are stripped in the cleanup pass below
      char *p;
      while( (p = strstr(scratchbuf4, "../")) ) {
        p[0] = p[1] = p[2] = 0x80;
        while( *p != '/' ) { *p-- = 0x80; }
        *p-- = 0x80;
        while( *p != '/' ) { *p-- = 0x80; }
      }

      // next remove "./" patterns the same way
      while( (p = strstr(scratchbuf4, "./")) ) {
        p[0] = p[1] = 0x80;
        while( *p != '/' ) { *p-- = 0x80; }
      }

      // and clean up: copy everything except the 0x80 marker bytes
      char *q = scratchbuf3; 
      p = scratchbuf4;
      while( *p ) { 
        if( *p & 0x80 ) {
          p++;
        } else {
          *q++ = *p++;
        }
      }
      *q = 0;

      strcat(scratchbuf3, scratchbuf6);
      strcat(scratchbuf3, scratchbuf5);

      fullhref = scratchbuf3;
    }
  } else { // relative url but baseurl == NULL
    cerr << "error: relative url encountered but baseurl == NULL" << endl;
    throw domain_error("");
  }

  // extra bit of code: some people use file: when they mean
  // http:, so here we replace the file scheme with http
  if( (strncasecmp(fullhref, "file", 4) == 0) 
    && strstr(fullhref,"htm") ) {
    fullhref[0] = 'h';
    fullhref[1] = 't';
    fullhref[2] = 't';
    fullhref[3] = 'p';
  }

  NormalizeURLPath(fullhref);
  // DEBUG cout << " * " << fullhref << endl;
 
  assert( strlen(fullhref) < STRINGBUF_LEN2 );

  return fullhref;
}

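/*
 * Worked example (mirrors the unit test below): resolving the fragment
 * "../folder/special.shtm" against the base URL
 * "http://www.somewhere.edu/directory/" yields
 * "http://www.somewhere.edu/folder/special.shtm", because the base path
 * is treated as a directory, the relative path is appended, and the
 * "../" component cancels "directory/".
 */
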
#ifdef UNIT_TEST

#define MAKE_TEST_STATEMENT(x) \
do { \
  cout << "testing: " #x << endl; \
  cout << "output: " << endl; \
  (x); \
} while(false)

#define MAKE_TEST(x,y) \
do { \
   cout << argv[0] << ": " << (((x) == (y)) ? "PASSED" : "FAILED") << " " #x <<endl; \
   cout << "output: " << (x) << endl; \
} while(false)

#define MAKE_TEST_STRING(x,y) \
do { \
   cout << argv[0] << ": " << ((strcmp((x),(y)) == 0) ? "PASSED" : "FAILED") << " " #y <<endl; \
   cout << "output: " << (x) << endl; \
} while(false)

int main(int argc, char** argv) {

  const char* strings[] = {
    "http://www.somewhere.edu/directory/",
    "http://www.site.com/default.htm",
    "file://www.buffalo.edu/admin/one.html",
    "../folder/special.shtm"
  };

  URLFilter *uf = new URLFilter(false);
  URLComponents base;
  ContentType contype;

  const char* result;
  
  cout << (result = uf->FormatURL(strings[0], strlen(strings[0]), NULL, &contype)) << endl;
  MAKE_TEST_STRING(result,"http://www.somewhere.edu/directory/index.html");

  cout << (result = uf->FormatURL(strings[1], strlen(strings[1]), NULL, &contype)) << endl;
  MAKE_TEST_STRING(result,"http://www.site.com/default.htm");
  MAKE_TEST(contype,CONTENT_TEXT_HTML);

  cout << (result = uf->FormatURL(strings[2], strlen(strings[2]), NULL, &contype)) << endl;
  MAKE_TEST_STRING(result,"http://www.buffalo.edu/admin/one.html");

  uf->ParseURL(strings[0], base.scheme, base.netloc, base.query, base.params, base.path);
  cout << (result = uf->FormatURL(strings[3], strlen(strings[3]), &base, &contype)) << endl;
  MAKE_TEST_STRING(result,"http://www.somewhere.edu/folder/special.shtm");

  char bug[10]; strcpy(bug,"hello");
  MAKE_TEST_STRING(stringreverse(bug), "olleh");

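  // Illustrative extra check (not in the original test set): ParseURL()
  // should split a full URL into its components, lowercasing the scheme
  // and network location and keeping the '?' and ';' delimiters.
  URLComponents comps;
  uf->ParseURL("HTTP://www.Example.com/a/b;type=x?id=1#top",
               comps.scheme, comps.netloc, comps.query, comps.params, comps.path);
  MAKE_TEST_STRING(comps.scheme, "http");
  MAKE_TEST_STRING(comps.netloc, "www.example.com");
  MAKE_TEST_STRING(comps.path, "/a/b");
  MAKE_TEST_STRING(comps.params, ";type=x");
  MAKE_TEST_STRING(comps.query, "?id=1");
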
//   cout << (result = uf->FormatURL(strings[0], strlen(strings[0]), &base, &contype)) << endl;
//   MAKE_TEST_STRING(result,"http://www.somewhere.edu/directory/index.html");

}
#endif
