00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "urlfilter.h"
00023 #include <cassert>
00024 #include <ctype.h>
00025 #include <iostream>
00026
00027
00028 URLFilter :: URLFilter(bool removesuf) {
00029 flags.remove_html_suffix = removesuf;
00030 }
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046 const char* URLFilter :: DeindexURL(const char *anurl) {
00047
00048 assert(strlen(anurl) < STRINGBUF_LEN2);
00049 strcpy(deindex_scratchbuf,anurl);
00050
00051 char *p = strstr(deindex_scratchbuf, "/index.htm");
00052 if( p && ((p[10] == 0) || (p[11] == 0)) ) {
00053 *p = 0;
00054 } else if( flags.remove_html_suffix ) {
00055 p = strstr(deindex_scratchbuf, ".htm");
00056 if( p ) {
00057 if(p[4] == 0) {
00058 p[0] = 31;
00059 p[1] = 0;
00060 } else if(p[5] == 0) {
00061 p[0] = 30;
00062 p[1] = 0;
00063 }
00064 } else {
00065 p = strstr(deindex_scratchbuf, ".shtm");
00066 if( p ) {
00067 if(p[5] == 0) {
00068 p[0] = 29;
00069 p[1] = 0;
00070 } else if(p[6] == 0) {
00071 p[0] = 28;
00072 p[1] = 0;
00073 }
00074 } else {
00075 p = strstr(deindex_scratchbuf, ".asp");
00076 if( p ) {
00077 if(p[4] == 0) {
00078 p[0] = 27;
00079 p[1] = 0;
00080 } else {
00081 p = strstr(deindex_scratchbuf, ".php");
00082 if( p ) {
00083 if(p[4] == 0) {
00084 p[0] = 26;
00085 p[1] = 0;
00086 }
00087 }
00088 }
00089 }
00090 }
00091 }
00092 }
00093 return deindex_scratchbuf;
00094 }
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107 const char* URLFilter :: CompressURL(const char *anurl) {
00108 const char *p = anurl;
00109 char *q = comp_scratchbuf;
00110 while( *p ) {
00111 if( p[1] == '/' ) {
00112 *q++ = (*p | SLASHBIT);
00113 p++;
00114 } else {
00115 *q++ = *p;
00116 }
00117 p++;
00118 }
00119 *q = 0;
00120 return anurl;
00121 }
00122
/**
 * Reverses a NUL-terminated string in place.
 *
 * @param str writable, NUL-terminated buffer.
 * @return str, for call chaining.
 */
char *stringreverse(char *str)
{
  if( *str == 0 ) {
    return str;                     // empty string: nothing to swap
  }

  char *low = str;
  char *high = str + strlen(str) - 1;

  // Swap through a local temporary so the terminator is never overwritten.
  while( low < high ) {
    const char tmp = *low;
    *low++ = *high;
    *high-- = tmp;
  }
  return str;
}
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150 void URLFilter :: ParseURL(const char *anurl, char *schemebuf,
00151 char *netlocbuf, char *querybuf,
00152 char *paramsbuf, char *pathbuf) {
00153
00154 assert( strlen(anurl) <= STRINGBUF_LEN1 );
00155
00156 strcpy(parse_scratchbuf, anurl);
00157
00158 char *q;
00159
00160
00161 char *p = strchr(parse_scratchbuf, '#');
00162 if( p ) { *p = 0; }
00163
00164
00165 p = strchr(parse_scratchbuf, ':');
00166 if( p ) {
00167 int l = p - parse_scratchbuf;
00168 strncpy(schemebuf, parse_scratchbuf, l);
00169 schemebuf[l] = 0;
00170 for(int k = 0; k < l; k++) {
00171 if( !isalnum(schemebuf[k]) && !(schemebuf[k] == '+') &&
00172 !(schemebuf[k] == '.') && !(schemebuf[k] == '-') ) {
00173 schemebuf[0] = 0;
00174 }
00175 schemebuf[k] = tolower(schemebuf[k]);
00176 }
00177 q = p + 1;
00178 } else {
00179 schemebuf[0] = 0;
00180 q = parse_scratchbuf;
00181 }
00182
00183
00184 if( (q[0] == '/') && (q[1] == '/') ) {
00185 p = strchr(q + 2, '/');
00186 if( p ) {
00187 int l = p - q - 2;
00188 strncpy(netlocbuf, q + 2, l);
00189 netlocbuf[l] = 0;
00190 q = p;
00191
00192 } else {
00193 strcpy(netlocbuf, q + 2);
00194 q = parse_scratchbuf + strlen(parse_scratchbuf);
00195 assert( *q == 0 );
00196 }
00197 } else {
00198 netlocbuf[0] = 0;
00199
00200 }
00201
00202
00203 for(char* cp = netlocbuf; *cp; cp++) { *cp = tolower(*cp); }
00204
00205
00206 p = strchr(q, '?');
00207 if( p ) {
00208 strcpy(querybuf, p);
00209 *p = 0;
00210 } else {
00211 querybuf[0] = 0;
00212 }
00213
00214
00215 p = strchr(q, ';');
00216 if( p ) {
00217 strcpy(paramsbuf, p);
00218 *p = 0;
00219 } else {
00220 paramsbuf[0] = 0;
00221 }
00222
00223
00224 strcpy(pathbuf, q);
00225 }
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239 void URLFilter :: NormalizeURLPath(char *apath) {
00240 int j = strlen(apath) - 1;
00241 if( j > 0 ) {
00242 if( apath[j] == '/' ) {
00243 if( j < (STRINGBUF_LEN1 - 11) )
00244 strcat(apath, "index.html");
00245 } else if( apath[j] == 'm' ) {
00246
00247 if( (j >= 8) &&
00248 (strncasecmp(apath + j - 8, "index.ht", 8) == 0) &&
00249 (j < (STRINGBUF_LEN1 - 2)) ) {
00250 apath[++j] = 'l';
00251 apath[++j] = 0;
00252 }
00253 }
00254 } else {
00255 strcpy(apath, "/index.html");
00256 }
00257 }
00258
00259
00260
00261 ContentType URLFilter :: ClassifyURLPath(const char *path) {
00262 ContentType ctype;
00263 int pathlen = strlen(path);
00264
00265 if( (pathlen > 5) && (strncasecmp(path + pathlen - 5, ".html", 5) == 0) ) {
00266 ctype = CONTENT_TEXT_HTML;
00267 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".htm", 4) == 0) ) {
00268 ctype = CONTENT_TEXT_HTML;
00269 } else if( (pathlen > 6) && (strncasecmp(path + pathlen - 6, ".shtml", 6) == 0) ) {
00270 ctype = CONTENT_TEXT_HTML;
00271 } else if( (pathlen > 7) && (strncasecmp(path + pathlen - 7, ".readme", 7) == 0) ) {
00272 ctype = CONTENT_TEXT_PLAIN;
00273 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".asp", 4) == 0) ) {
00274 ctype = CONTENT_TEXT_HTML;
00275 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".lha", 4) == 0) ) {
00276 ctype = CONTENT_APPLICATION_XGZIP;
00277 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".txt", 4) == 0) ) {
00278 ctype = CONTENT_TEXT_PLAIN;
00279 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".rtf", 4) == 0) ) {
00280 ctype = CONTENT_TEXT_RTF;
00281 } else if( (pathlen > 3) && (strncasecmp(path + pathlen - 3, ".ps", 3) == 0) ) {
00282 ctype = CONTENT_APPLICATION_POSTSCRIPT;
00283 } else if( (pathlen > 3) && (strncasecmp(path + pathlen - 3, ".gz", 3) == 0) ) {
00284 ctype = CONTENT_APPLICATION_XGZIP;
00285 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".doc", 4) == 0) ) {
00286 ctype = CONTENT_APPLICATION_MSWORD;
00287 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".ppt", 4) == 0) ) {
00288 ctype = CONTENT_APPLICATION_MS_POWERPOINT;
00289 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".jpg", 4) == 0) ) {
00290 ctype = CONTENT_IMAGE;
00291 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".zip", 4) == 0) ) {
00292 ctype = CONTENT_APPLICATION_XGZIP;
00293 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".gif", 4) == 0) ) {
00294 ctype = CONTENT_IMAGE;
00295 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".png", 4) == 0) ) {
00296 ctype = CONTENT_IMAGE;
00297 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".pdf", 4) == 0) ) {
00298 ctype = CONTENT_APPLICATION_PDF;
00299 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".php", 4) == 0) ) {
00300 ctype = CONTENT_TEXT_HTML;
00301 } else if( (pathlen > 5) && (strncasecmp(path + pathlen - 5, ".jpeg", 5) == 0) ) {
00302 ctype = CONTENT_IMAGE;
00303 } else if( (pathlen > 5) && (strncasecmp(path + pathlen - 5, ".shtm", 5) == 0) ) {
00304 ctype = CONTENT_TEXT_HTML;
00305 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".ram", 4) == 0) ) {
00306 ctype = CONTENT_AUDIO_MP3;
00307 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".mov", 4) == 0) ) {
00308 ctype = CONTENT_AUDIO_MP3;
00309 } else if( (pathlen > 4) && (strncasecmp(path + pathlen - 4, ".mpg", 4) == 0) ) {
00310 ctype = CONTENT_AUDIO_MP3;
00311 } else {
00312
00313 ctype = CONTENT_GOOGLE_OTHER;
00314 }
00315 return ctype;
00316 }
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330
00331
00332
00333 const char * URLFilter :: FormatURL(const char *anurl, int anurl_len,
00334 URLComponents *baseurl, ContentType *foundtype) throw (domain_error) {
00335 assert(anurl_len > 0);
00336
00337
00338
00339
00340
00341 if( anurl_len >= STRINGBUF_LEN1 ) {
00342 cerr << "warning: truncating url" << endl;
00343 anurl_len = STRINGBUF_LEN1;
00344 }
00345
00346 char *fullhref = NULL;;
00347
00348
00349
00350
00351
00352
00353 int a;
00354 for(a = 0; a < anurl_len; a++) {
00355 if( (anurl[a] <= 0x1F) || (anurl[a] >= 0x7F) ) {
00356 break;
00357 } else {
00358 scratchbuf1[a] = anurl[a];
00359 }
00360 }
00361 scratchbuf1[a] = 0;
00362
00363
00364 ParseURL(scratchbuf1, scratchbuf0, scratchbuf3, scratchbuf5, scratchbuf6, scratchbuf2);
00365
00366 NormalizeURLPath(scratchbuf2);
00367
00368 *foundtype = ClassifyURLPath(scratchbuf2);
00369
00370 if( strlen(scratchbuf3) > 0 ) {
00371
00372 if( scratchbuf0[0] ) {
00373 strcpy(scratchbuf1, scratchbuf0);
00374 strcat(scratchbuf1, "://");
00375 } else {
00376 strcpy(scratchbuf1, "http://");
00377 }
00378 strcat(scratchbuf1, scratchbuf3);
00379 strcat(scratchbuf1, scratchbuf2);
00380 strcat(scratchbuf1, scratchbuf6);
00381 strcat(scratchbuf1, scratchbuf5);
00382 fullhref = scratchbuf1;
00383
00384 } else if( baseurl ) {
00385
00386 if( scratchbuf0[0] ) {
00387 strcpy(scratchbuf4, scratchbuf0);
00388 strcat(scratchbuf4, "://");
00389 } else {
00390 strcpy(scratchbuf4, "http://");
00391 }
00392 strcat(scratchbuf4, baseurl->netloc);
00393 strcat(scratchbuf4, "/");
00394
00395
00396 if( scratchbuf2[0] == '/' ) {
00397
00398 strcat(scratchbuf4, scratchbuf2 + 1);
00399 strcat(scratchbuf4, scratchbuf6);
00400 strcat(scratchbuf4, scratchbuf5);
00401 fullhref = scratchbuf4;
00402
00403 } else {
00404
00405
00406
00407
00408 int r = strlen(baseurl->path) - 1;
00409 if( strlen(baseurl->query) || strlen(baseurl->params) ||
00410 ((r > 5) && (strncasecmp(baseurl->path + r - 5, ".shtml",6) == 0)) ||
00411 ((r > 4) && (strncasecmp(baseurl->path + r - 4, ".html",5) == 0)) ||
00412 ((r > 3) && (strncasecmp(baseurl->path + r - 3, ".htm",4) == 0)) ) {
00413
00414 char *s = strrchr(baseurl->path, '/');
00415 assert(s);
00416 char *t = scratchbuf4 + strlen(scratchbuf4);
00417 char *u = baseurl->path + 1;
00418 while( u <= s ) {
00419 *t++ = *u++;
00420 }
00421 *t = 0;
00422
00423
00424 } else {
00425
00426 if( baseurl->path[0] == '/' ) {
00427 strcat(scratchbuf4, baseurl->path + 1);
00428 } else {
00429 strcat(scratchbuf4, baseurl->path);
00430 }
00431
00432
00433 int l = strlen(scratchbuf4) - 1;
00434 if( (l >= 0) && (scratchbuf4[l] != '/') ) {
00435 scratchbuf4[++l] = '/';
00436 scratchbuf4[++l] = 0;
00437 } else if( l < 0) {
00438 scratchbuf4[0] = '/';
00439 scratchbuf4[1] = 0;
00440 }
00441
00442 }
00443
00444
00445
00446 strcat(scratchbuf4, scratchbuf2);
00447
00448 char *p;
00449 while( (p = strstr(scratchbuf4, "../")) ) {
00450 p[0] = p[1] = p[2] = 0x80;
00451 while( *p != '/' ) { *p-- = 0x80; }
00452 *p-- = 0x80;
00453 while( *p != '/' ) { *p-- = 0x80; }
00454 }
00455
00456
00457 while( (p = strstr(scratchbuf4, "./")) ) {
00458 p[0] = p[1] = 0x80;
00459 while( *p != '/' ) { *p-- = 0x80; }
00460 }
00461
00462
00463 char *q = scratchbuf3;
00464 p = scratchbuf4;
00465 while( *p ) {
00466 if( *p & 0x80 ) {
00467 p++;
00468 } else {
00469 *q++ = *p++;
00470 }
00471 }
00472 *q = 0;
00473
00474 strcat(scratchbuf3, scratchbuf6);
00475 strcat(scratchbuf3, scratchbuf5);
00476
00477 fullhref = scratchbuf3;
00478 }
00479 } else {
00480 cerr << "error: relative url encountered but baseurl == NULL" << endl;
00481 throw domain_error("");
00482 }
00483
00484
00485
00486 if( (strncasecmp(fullhref, "file", 4) == 0)
00487 && strstr(fullhref,"htm") ) {
00488 fullhref[0] = 'h';
00489 fullhref[1] = 't';
00490 fullhref[2] = 't';
00491 fullhref[3] = 'p';
00492 }
00493
00494 NormalizeURLPath(fullhref);
00495
00496
00497 assert( strlen(fullhref) < STRINGBUF_LEN2 );
00498
00499 return fullhref;
00500 }
00501
00502 #ifdef UNIT_TEST
00503
// Echoes the source text of x, then executes it.
#define MAKE_TEST_STATEMENT(x) \
  do { \
    std::cout << "testing: " #x << std::endl; \
    std::cout << "output: " << std::endl; \
    (x); \
  } while(false)

// Prints PASSED/FAILED for actual == expected followed by the actual value.
// Taking the operands as function arguments guarantees that the tested
// expression is evaluated exactly once.
template <class Actual, class Expected>
static void ReportTestResult(const char *prog, const char *label,
                             const Actual &actual, const Expected &expected) {
  std::cout << prog << ": " << ((actual == expected) ? "PASSED" : "FAILED")
            << " " << label << std::endl;
  std::cout << "output: " << actual << std::endl;
}

// Compares x with y using ==. Expects `argv` to be in scope (program name).
#define MAKE_TEST(x,y) ReportTestResult(argv[0], #x, (x), (y))

// Compares the C string x with y using strcmp. x is evaluated once and its
// value reused for the report, so expressions with side effects are safe.
// Expects `argv` to be in scope (program name).
#define MAKE_TEST_STRING(x,y) \
  do { \
    const char *make_test_actual_ = (x); \
    std::cout << argv[0] << ": " << ((strcmp(make_test_actual_,(y)) == 0) ? "PASSED" : "FAILED") << " " #y << std::endl; \
    std::cout << "output: " << make_test_actual_ << std::endl; \
  } while(false)
00522
00523 int main(int argc, char** argv) {
00524
00525 const char* strings[] = {
00526 "http://www.somewhere.edu/directory/",
00527 "http://www.site.com/default.htm",
00528 "file://www.buffalo.edu/admin/one.html",
00529 "../folder/special.shtm"
00530 };
00531
00532 URLFilter *uf = new URLFilter(false);
00533 URLComponents base;
00534 ContentType contype;
00535
00536 const char* result;
00537
00538 cout << (result = uf->FormatURL(strings[0], strlen(strings[0]), NULL, &contype)) << endl;
00539 MAKE_TEST_STRING(result,"http://www.somewhere.edu/directory/index.html");
00540
00541 cout << (result = uf->FormatURL(strings[1], strlen(strings[1]), NULL, &contype)) << endl;
00542 MAKE_TEST_STRING(result,"http://www.site.com/default.htm");
00543 MAKE_TEST(contype,CONTENT_TEXT_HTML);
00544
00545 cout << (result = uf->FormatURL(strings[2], strlen(strings[2]), NULL, &contype)) << endl;
00546 MAKE_TEST_STRING(result,"http://www.buffalo.edu/admin/one.html");
00547
00548 uf->ParseURL(strings[0], base.scheme, base.netloc, base.query, base.params, base.path);
00549 cout << (result = uf->FormatURL(strings[3], strlen(strings[3]), &base, &contype)) << endl;
00550 MAKE_TEST_STRING(result,"http://www.somewhere.edu/folder/special.shtm");
00551
00552 char bug[10]; strcpy(bug,"hello");
00553 MAKE_TEST_STRING(stringreverse(bug), "olleh");
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564 }
00565 #endif