Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members  

parsehandler-graph.cc

Go to the documentation of this file.
00001 /** @file parsehandler-graph.cc */
00002 /* 
00003  * Copyright (C) 2002 Laird Breyer
00004  *  
00005  * This program is free software; you can redistribute it and/or modify
00006  * it under the terms of the GNU General Public License as published by
00007  * the Free Software Foundation; either version 2 of the License, or
00008  * (at your option) any later version.
00009  * 
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  * 
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00018  * 
00019  * Author:   Laird Breyer <laird@lbreyer.com>
00020  */
00021 
00022 #include "handler-parser.h"
00023 #include "graphbuilder.h"
00024 #include "content-type.h"
00025 
00026 /// A custom case insensitive strncmp
00027 extern int strincmp(const char *s1, const char *s2, int s2len);
00028 
00029 /// Reads anchor links and other document information for use by GraphBuilder.
00030 /**
00031  * GraphParseHandler is called for each document being processed, and 
00032  * interfaces with GraphBuilder, telling it the current document's 
00033  * URL and anchor links, as well as the document's date.
00034  *
00035  * Not all anchor links are passed on to GraphBuilder, however. GraphParseHandler
00036  * doesn't pass on autolinks, or links to non-html documents.
00037  */
00038 class GraphParseHandler : public ParseHandler {
00039  public:
00040 
00041   GraphParseHandler(GraphBuilder *graphbuilder);
00042 
00043   virtual void NewDocument(const Document* doc);
00044   virtual void AddHeader(const char* key, int keylen,
00045                          const char* value, int valuelen);
00046   virtual void AddAnchor(const char* href, int hreflen);
00047 
00048 private:
00049   GraphBuilder * gb;
00050 };
00051 
00052 ParseHandler* MakeGraphHandler(GraphBuilder *graphbuilder) {
00053   return new GraphParseHandler(graphbuilder);
00054 }
00055 
00056 GraphParseHandler :: GraphParseHandler(GraphBuilder *graphbuilder) {
00057   assert( graphbuilder );
00058   gb = graphbuilder;
00059 }
00060 
00061 void GraphParseHandler :: NewDocument(const Document* doc) {
00062 
00063   // we use doc-url() but prefer doc->url_after_redirects()
00064 
00065   if( strlen(doc->url_after_redirects()) > 0 ) {
00066     gb->NodeSetURL(doc->url_after_redirects(), doc->url());
00067     //    cerr << "info: detected a redirected document url" << endl;
00068   } else {
00069     gb->NodeSetURL(doc->url(), NULL);
00070   }
00071 }
00072 
00073 /// Reads the date and calls gb->NodeSetDate()
00074 void GraphParseHandler :: AddHeader(const char* key, int keylen,
00075                                     const char* value, int valuelen) {
00076   // if we're parsing the date, convert
00077   // to number of days (julian date) and place into current WebNode
00078   if( (strncmp(key,"Date",4) == 0) ||
00079       (strncmp(key,"Last-Modified",13) == 0) ||
00080       (strncmp(key,"X-Google-Crawl-Date",19) == 0) ) {
00081 
00082     // assume date format Fri, 07 Sep 2001 04:55:16 GMT
00083     // also accept Wednesday, 14-Nov-01 00:53:22 GMT
00084     // anything else is deemed a bad date
00085     assert( valuelen < 50 );
00086     char mydate[50];
00087     char * p;
00088     int d=0, m=0, y=0;
00089     bool badformat = false;
00090     int64 jd;
00091     char * daytable = "janfebmaraprmayjunjulaugsepoctnovdec";
00092 
00093     // stupid strtok modifies string
00094     strncpy(mydate,value,valuelen);
00095     mydate[valuelen] = 0;
00096 
00097     // actually, if we blank out the time stamp, then we can parse more date formats
00098     for( p = mydate; *p; p++) {
00099       if( isdigit(p[0]) && isdigit(p[1]) && (p[2] == ':') && 
00100           isdigit(p[3]) && isdigit(p[4]) && (p[5] == ':') &&
00101           isdigit(p[6]) && isdigit(p[7]) && (!isdigit(p[8])) ) {
00102         memset(p, ' ', 8);
00103         break;
00104       }
00105     } 
00106 
00107     // now start tokenizing
00108     if( !strtok(mydate, " -") ) { // ignore day name
00109       badformat = true;
00110     }
00111 
00112     for(int daymon=0; daymon < 2; daymon++) {
00113       p =  strtok(NULL, " -"); // day or month
00114       if( p && isdigit(p[0]) ) { 
00115         d = atoi(p);
00116         if( (d > 31) || (d < 1) ) {
00117           badformat = true;
00118         }
00119       } else if( p && isalnum(p[0]) ) {
00120         m = 0;
00121         for(int k = 0; k < 12; k++) {
00122           int q = 3*k;
00123           if( (daytable[q] == tolower(p[0])) &&
00124               (daytable[q+1] == tolower(p[1])) &&
00125               (daytable[q+2] == tolower(p[2])) ) {
00126             m = k+1;
00127             break;
00128           }
00129         }
00130         if( m == 0 ) {
00131           badformat = true;
00132         }
00133       } else {
00134         badformat = true;
00135       }
00136     }
00137 
00138     p = strtok(NULL, " -"); // year
00139     if( p ) {
00140       y = atoi(p);
00141       if( y < 100 ) {
00142         y = (y < 50) ? (y + 2000) : (y + 1900);
00143       } else if( y < 1994) {
00144         if( y == 100 ) {
00145           y = 2000;
00146         } else if( y == 101 ) {
00147           y = 2001;
00148         } else {
00149           badformat = true;
00150         }
00151       } else if( y > 2010) {
00152         badformat = true;
00153       }
00154     } else {
00155       badformat = true;
00156     }
00157 
00158     if( !badformat ) {
00159       // convert to julian day number, since 15 Oct 1582
00160       jd = ( 1461 * ( y + 4800 + ( m - 14 ) / 12 ) ) / 4 +
00161         ( 367 * ( m - 2 - 12 * ( ( m - 14 ) / 12 ) ) ) / 12 -
00162         ( 3 * ( ( y + 4900 + ( m - 14 ) / 12 ) / 100 ) ) / 4 +
00163         d + 2367925;
00164       jd -= 4845000; // convert to int - offset arbitrary
00165       // DEBUG cout << d << " " << m << " " << y << " " << jd << endl;
00166     } else {
00167       jd = 0;
00168       strncpy(mydate,value,valuelen); // trailing null is still there
00169       cerr << "warning: cannot parse date, setting to zero [" << mydate << "]" << endl;
00170     }
00171 
00172     // now inser the date into the new node
00173     assert( jd <= kuint16max );
00174     gb->NodeSetDate((uint16)jd);
00175   }
00176 }
00177 
00178 void GraphParseHandler :: AddAnchor(const char* href, int hreflen) {
00179   // here we try to classify the type of anchor link, to see if it's
00180   // worth adding to the web graph. We *don't* convert %-encoded characters,
00181   // so obfuscated urlst are not treated properly - tough. Also, we don't
00182   // take particularly good care about case handling, which depends upon
00183   // the server's filesystem conventions.
00184   if( hreflen <= 0 ) { // wtf? discard
00185     return;
00186   } else if( href[0] == '#' ) { // link back to itself, discard
00187     return;
00188   } else if( (hreflen > 6) && (strncasecmp(href, "ftp://", 6) == 0) ) { 
00189     // most likely data file, discard
00190     return;
00191   } else if( (hreflen > 9) && (strncasecmp(href, "gopher://", 9) == 0) ) { 
00192     // foreign protocol, no hyperlinks
00193     return;
00194   } else if( (hreflen > 7) && (strncasecmp(href, "mailto:", 7) == 0) ) { 
00195     // no hyperlinks
00196     return;
00197   } else if( (hreflen > 5) && (strncasecmp(href, "nntp:", 5) == 0) ) { 
00198     // foreign protocol
00199     return;
00200   } else if( (hreflen > 5) && (strncasecmp(href, "news:", 5) == 0) ) { 
00201     // like nntp
00202     return;
00203   } else if( (hreflen > 7) && (strncasecmp(href, "telnet:", 7) == 0) ) { 
00204     // foreign protocol, no hyperlinks
00205     return;
00206   } else if( (hreflen > 7) && (strncasecmp(href, "tn3270:", 7) == 0) ) { 
00207     // like telnet
00208     return;
00209   } else if( (hreflen > 5) && (strncasecmp(href, "wais:", 5) == 0) ) { 
00210     // foreign
00211     return;
00212   } else if( (hreflen > 5) && (strncasecmp(href, "file:", 5) == 0) ) { 
00213     // this is debatable whether to accept, since some people
00214     // link html files this way
00215     char *p = strstr(href,"htm");
00216     if( p && ((p - href) < hreflen) ) { 
00217       // seems to contain an (s)htm(l) file
00218     } else { // discard
00219       return;
00220     }
00221   } else if( (hreflen > 11) && (strncasecmp(href, "javascript:", 11) == 0) ) { 
00222     // junk, discard - Die X10!
00223     return;
00224   } else if( (hreflen > 9) && (strncasecmp(href, "prospero:", 11) == 0) ) { 
00225     // foreign, discard
00226     return;
00227   }
00228 
00229   ContentType ctype;
00230   const char *url = gb->FormatURL(href, hreflen, &ctype);
00231 
00232   // throw away images and postscript documents, but keep GOOGLE_OTHER,
00233   // which means we didn't know how to classify the link
00234   switch (ctype) {
00235   case CONTENT_APPLICATION_POSTSCRIPT:
00236   case CONTENT_APPLICATION_PDF:
00237   case CONTENT_APPLICATION_MSWORD:
00238   case CONTENT_TEXT_RTF:
00239   case CONTENT_APPLICATION_MS_POWERPOINT:
00240   case CONTENT_APPLICATION_XGZIP:
00241   case CONTENT_IMAGE:
00242   case CONTENT_TEXT_PLAIN:
00243   case CONTENT_AUDIO_MP3:
00244     return;
00245     break;
00246   case CONTENT_GOOGLE_OTHER:
00247   case CONTENT_TEXT_HTML:
00248   default:
00249     break;
00250     }
00251 
00252   // don't allow a link to the current document
00253   if( strcmp(url, gb->NodeGetURL()) != 0 ) {
00254     gb->TrieInsertLinkURL(url); 
00255     if(gb->flags.show_links) {
00256       cout << " * " << url << endl;
00257     }
00258   } 
00259 } 
00260 
00261 
00262 
00263 
00264 
00265 
00266 
00267 
00268 
00269 
00270 

Generated on Wed May 29 11:37:15 2002 for MarkovPR by doxygen1.2.15