Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members  

ripper.cc

Go to the documentation of this file.
00001 /** @file ripper.cc */
00002 //***************************************************************************
00003 // This source code is copyrighted 2002 by Google Inc.  All rights
00004 // reserved.  You are given a limited license to use this source code for
00005 // purposes of participating in the Google programming contest.  If you
00006 // choose to use or distribute the source code for any other purpose, you
00007 // must either (1) first obtain written approval from Google, or (2)
00008 // prominently display the foregoing copyright notice and the following
00009 // warranty and liability disclaimer on each copy used or distributed.
00010 // 
00011 // The source code and repository (the "Software") is provided "AS IS",
00012 // with no warranty, express or implied, including but not limited to the
00013 // implied warranties of merchantability and fitness for a particular
00014 // use.  In no event shall Google Inc. be liable for any damages, direct
00015 // or indirect, even if advised of the possibility of such damages.
00016 //***************************************************************************
00017 
00018 
00019 // The Ripper iterates over one or more repository files, calling
00020 // handler methods that act on each document in turn. Handlers are
00021 // instances of a ParseHandler sub-class. See files
00022 // parsehandler-caturl.cc and parsehandler-preparsecat.cc for examples
00023 // of ParseHandler sub-classes.  To add a new handler to the ripper,
00024 // look at the sections of this file labelled "*NEW HANDLERS*" and
00025 // make the indicated modifications. Handlers may take arguments
00026 // supplied on the ripper command line.
00027 //
00028 // Note: this is the public ripper, which operates on pre-parsed repositories.
00029 
00030 #include <fstream>
00031 #include <sstream>
00032 #include <vector>
00033 #include "handler-parser.h"
00034 #include "repos-reader.h"
00035 #include "parseelt.h"
00036 
00037 // *NEW HANDLERS* - add includes here if needed
00038 #include <fstream>
00039 #include "graphbuilder.h"
00040 #include "linkgraph.h"
00041 #include "talker.h"
00042 
00043 static void error(string errmsg) { cerr << errmsg << std::endl; exit(1); }
00044 
00045 static void usage() {
00046   error("usage: ripper [--stop_after n] <handler options> "
00047         "{- | <repository files>}\n"
00048         "possible handler options are:\n"
00049         "--pvm_slave x         declares this ripper as a slave in a group of x tasks\n"
00050         "--pvm_master x        declares this ripper as the master in a group of x tasks\n"
00051         "--cat                 prints the full contents of each document\n"
00052         "--caturl              prints the url of each document\n"
00053         "--catlinks            prints the url and tolinks of each document\n"
00054         "--no_graphbuilder     doesn't build the web link graph\n"
00055         "--start_after x       starts numbering documents from x\n"
00056         "--stop_after x        stops processing after document number x\n"
00057         "--save_graph          prints the web link graph\n"
00058         "--save_index          prints an index of the documents read\n"
00059         "--string_memory x     reserves x Megabytes for all url strings\n"
00060         "--jumptable_memory x  reserves x Megabytes for all trie branch pointers\n"
00061         "--nodetable_memory x  reserves x Megabytes for the webnode hashtable\n"
00062         "--leaftable_memory x  reserves x Megabytes for the leafnode hashtable\n"
00063         "--name arthurdent     names this ripper arthurdent - no quotes please\n"
00064         "--temp_dir z          uses z as the temporary directory. Default is /tmp/\n"
00065         "--quit                noninteractive mode.\n"
00066 );
00067 }
00068 
// Defaults for Ripper::rippername and Ripper::tempdir, used until the
// --name / --temp_dir options replace them. They are writable arrays
// rather than pointers to string literals because the Ripper members are
// plain (non-const) char*, and binding a string literal to char* is not
// valid ISO C++11. The arrays decay to char* where they are assigned.
static char defaultrippername[] = "";
static char defaulttempdir[] = "/tmp/";

/// Running document counter. Ripper::RipRepository() increments it once
/// per document, and the --start_after option sets its initial value.
int num_docs_processed = 0;

// Maximum accepted strlen() of the --name and --temp_dir arguments.
#define RIPPER_NAMELEN 20
#define RIPPER_TMPDIRLEN 20
00075 
00076 /// Rips the repository and interfaces with main(). Calls GraphBuilder to build the web link graph
00077 class Ripper {
00078  public:
00079   Ripper();
00080   ~Ripper();
00081   void SetupHandlers();
00082   void ParseCmdLineArgs(int argc, char** argv);
00083   void RipRepository (ReposReader* rr);
00084   void PrintStatistics(ostream& o);
00085 
00086   WebLinkGraph* PublishWebGraph() { return gb->UndockWebGraph(); }
00087   GraphBuilder* GetGraphBuilder() { return gb; }
00088 
00089   vector<string> rep_files_;
00090 
00091   // *NEW HANDLERS* - Add command line flags and values here
00092   struct {
00093     int stop_after;    ///< if non-zero, stop processing after this many docs
00094     int string_memory;    ///< memory to reserve for string table
00095     int jumptable_memory;    ///< memory to reserve for jumptable
00096     int nodetable_memory;    ///< memory to reserve for nodetable
00097     int leaftable_memory;   ///< memory to reserve for leaftable
00098     int start_ID;     ///< starting id number for WebNodes
00099     int pvm_numtasks; ///< number of pvm tasks in group
00100     bool pvm_is_master; ///< slave or master?
00101     bool repos_from_stdin; 
00102     bool interactive; ///< is ripper interactive or not?
00103     bool print_index; ///< prints the index of webnodes to a file
00104     bool handler_cat;  ///< simple handler to "cat" repository
00105     bool handler_caturl; ///< even simpler handler to "cat" just urls
00106     bool handler_catdate; ///< show date of the document
00107     bool handler_graph_print; ///< prints the web graph
00108     bool handler_catlinks; ///< display anchor links as they are being processed
00109     bool no_graphbuilder; ///< don't build the web link graph
00110   } flags_;
00111 
00112   char *rippername;
00113   char *tempdir;
00114   ofstream indexout;
00115 
00116  private:
00117   // list of parse-handlers to call for each document
00118   vector<ParseHandler*> parsehandlers_; 
00119   // this class builds the web graph from the repositories
00120   GraphBuilder* gb;
00121 };
00122 
00123 Ripper::Ripper() {
00124   flags_.stop_after = 0;
00125   flags_.start_ID = 0;
00126 
00127   flags_.string_memory  = 45; // Mb
00128   flags_.jumptable_memory  = 35; // Mb
00129   flags_.nodetable_memory  = 10; // Mb
00130   flags_.leaftable_memory  = 10; // Mb
00131 
00132   flags_.interactive = true;
00133   flags_.no_graphbuilder = false;
00134   flags_.repos_from_stdin = false;
00135 
00136   flags_.handler_cat = false;
00137   flags_.handler_caturl = false;
00138   flags_.handler_catdate = false;
00139   flags_.handler_graph_print = false;
00140   flags_.handler_catlinks = false;
00141 
00142   flags_.print_index = false;
00143 
00144   flags_.pvm_is_master = true;
00145   flags_.pvm_numtasks = 0;
00146 
00147   rippername = defaultrippername;
00148   tempdir = defaulttempdir;
00149 
00150   //  indexout = NULL;
00151   gb = NULL;
00152 }
00153 
00154 Ripper::~Ripper() {
00155   delete gb;
00156   // delete all parse handlers
00157   for (vector<ParseHandler*>::iterator ph = parsehandlers_.begin();
00158        ph != parsehandlers_.end(); ++ph)
00159     delete *ph;
00160 }
00161 
00162 void Ripper::SetupHandlers () {
00163 
00164   if(!flags_.no_graphbuilder) {
00165     gb = new GraphBuilder(flags_.string_memory, flags_.jumptable_memory,
00166                           flags_.nodetable_memory, flags_.leaftable_memory,
00167                           flags_.handler_catlinks);
00168   }
00169 
00170   if (flags_.handler_cat) {
00171     extern ParseHandler* MakeCatHandler();
00172     parsehandlers_.push_back(MakeCatHandler());
00173   }
00174 
00175   if (flags_.handler_caturl) {
00176     extern ParseHandler* MakeCatURLHandler();
00177     parsehandlers_.push_back(MakeCatURLHandler());
00178   }
00179 
00180   if (flags_.handler_catdate) {
00181     extern ParseHandler* MakeCatDateHandler();
00182     parsehandlers_.push_back(MakeCatDateHandler());
00183   }
00184   
00185   if (!flags_.no_graphbuilder) { // always make graph
00186     extern ParseHandler* MakeGraphHandler(GraphBuilder *graphbuilder);
00187     parsehandlers_.push_back(MakeGraphHandler(gb));
00188   }
00189 
00190   if (flags_.print_index) {
00191     if( !*rippername || (strlen(rippername) > RIPPER_NAMELEN) ) {
00192       cerr << "warning: index won't be saved. You must give ripper a short name." << endl;
00193       usage();
00194     } else {
00195 
00196       ostringstream nambuf;
00197       nambuf << tempdir << "ripper." << rippername << ".index";
00198       cerr << "info: index saved in file " << nambuf.str() << endl;
00199       indexout.open(nambuf.str().c_str());
00200       if( indexout.fail() ) {
00201         cerr << "error: problem opening index file [" << nambuf.str() << "]" << endl;
00202       } 
00203     }
00204   }
00205   if (parsehandlers_.size() == 0) {
00206     usage();
00207   }
00208 
00209 }
00210 
00211 /// reads each document and builds a WebNode from
00212 /// it through GraphBuilder
00213 void Ripper::RipRepository (ReposReader* rr) {
00214   if( flags_.no_graphbuilder ) {
00215 
00216     while (!rr->AtEnd() && 
00217            (flags_.stop_after == 0 || num_docs_processed < flags_.stop_after)) {
00218       ParseElt::Process_Document(rr, &parsehandlers_);
00219       num_docs_processed++;
00220     }
00221 
00222   } else {
00223 
00224     while (!rr->AtEnd() && 
00225            (flags_.stop_after == 0 || num_docs_processed < flags_.stop_after)) {
00226       gb->NodeInitialize(num_docs_processed);
00227       ParseElt::Process_Document(rr, &parsehandlers_);
00228       gb->NodeInsertLinks();
00229       if( flags_.print_index && indexout) {
00230          indexout << gb->NodeGetURL() << " " 
00231                   << gb->NodeGetID() << " "
00232                   << gb->NodeGetDate() << " " << gb->NodeGetAlias()
00233            //             << ((gb->NodeGetAlias()) ? gb->NodeGetAlias() : "") 
00234                   << " ** " << gb->NodeGetURL_() << " " << ((gb->NodeGetAlias_()) ? gb->NodeGetAlias_() : "")
00235 << endl;
00236       }
00237       gb->NodeLaunch();
00238       num_docs_processed++;
00239     }  
00240 
00241   }
00242 }
00243 
00244 /// prints out some memory usage statistics.
00245 /// Useful for out of memory errors.
00246 void Ripper::PrintStatistics(ostream& o) {
00247   o << "Number of documents processed:           " << num_docs_processed << endl;
00248   if(gb) 
00249     gb->StatisticsMem(o);
00250 }
00251 
00252 /// sets flags for all the possible command line options
00253 void Ripper::ParseCmdLineArgs(int argc, char** argv) {
00254   argv++;
00255   for (int i = 1; i < argc; ++i, ++argv) {
00256     if ((*argv)[0] == '-' && (*argv)[1] != '\0') {  // option
00257       if (!strcmp(*argv, "--stop_after")) {
00258         ++i;
00259         ++argv;
00260         if (i >= argc) {
00261           usage();
00262         }
00263         flags_.stop_after = atoi(*argv);
00264       } else if (!strcmp(*argv, "--cat")) {
00265         flags_.handler_cat = true;
00266       } else if (!strcmp(*argv, "--caturl")) {
00267         flags_.handler_caturl = true;
00268       } 
00269       // *NEW HANDLERS* - add command line processing here
00270       else if (!strcmp(*argv, "--no_graphbuilder")) {
00271         flags_.no_graphbuilder = true;
00272       } else if (!strcmp(*argv, "--catdate")) {
00273         flags_.handler_catdate = true;
00274       } else if (!strcmp(*argv, "--quit")) {
00275         flags_.interactive = false;
00276       } else if (!strcmp(*argv, "--catlinks")) {
00277         flags_.handler_catlinks = true;
00278       } else if (!strcmp(*argv, "--save_graph")) {
00279         flags_.handler_graph_print = true;
00280       } else if (!strcmp(*argv, "--save_index")) {
00281         flags_.print_index = true;
00282       } else if (!strcmp(*argv, "--string_memory")) {
00283         ++i;
00284         ++argv;
00285         if (i >= argc) {
00286           usage();
00287         }
00288         flags_.string_memory = atoi(*argv);
00289       } else if (!strcmp(*argv, "--jumptable_memory")) {
00290         ++i;
00291         ++argv;
00292         if (i >= argc) {
00293           usage();
00294         }
00295         flags_.jumptable_memory = atoi(*argv);
00296       } else if (!strcmp(*argv, "--nodetable_memory")) {
00297         ++i;
00298         ++argv;
00299         if (i >= argc) {
00300           usage();
00301         }
00302         flags_.nodetable_memory = atoi(*argv);
00303       } else if (!strcmp(*argv, "--leaftable_memory")) {
00304         ++i;
00305         ++argv;
00306         if (i >= argc) {
00307           usage();
00308         }
00309         flags_.leaftable_memory = atoi(*argv);
00310       } else if (!strcmp(*argv, "--name")) {
00311         ++i;
00312         ++argv;
00313         if (i >= argc) {
00314           usage();
00315         }
00316         rippername = *argv;
00317         if( strlen(rippername) > RIPPER_NAMELEN) {
00318           cerr << "Name too long, please use a shorter one." << endl;
00319           usage();
00320         }
00321       } else if (!strcmp(*argv, "--temp_dir")) {
00322         ++i;
00323         ++argv;
00324         if (i >= argc) {
00325           usage();
00326         }
00327         tempdir = *argv;
00328         if( strlen(tempdir) > RIPPER_TMPDIRLEN) {
00329           cerr << "Please use a shorter temporary directory name." << endl;
00330           usage();
00331         }
00332       } else if (!strcmp(*argv, "--pvm_master")) {
00333         ++i;
00334         ++argv;
00335         if (i >= argc) {
00336           usage();
00337         }
00338         flags_.pvm_numtasks = atoi(*argv);
00339         flags_.pvm_is_master = true;
00340       } else if (!strcmp(*argv, "--pvm_slave")) {
00341         ++i;
00342         ++argv;
00343         if (i >= argc) {
00344           usage();
00345         }
00346         flags_.pvm_numtasks = atoi(*argv);
00347         flags_.pvm_is_master = false;
00348       } else if (!strcmp(*argv, "--start_after")) {
00349         ++i;
00350         ++argv;
00351         if (i >= argc) {
00352           usage();
00353         }
00354         flags_.start_ID = atoi(*argv);
00355         num_docs_processed = flags_.start_ID;
00356       } else 
00357         usage();
00358     } else { // repository file
00359       if ((*argv)[0] == '-' && (*argv)[1] == '\0') {
00360         flags_.repos_from_stdin = true;
00361       } else {
00362         rep_files_.push_back(string(*argv));
00363       }
00364     }
00365   }
00366   if (!flags_.repos_from_stdin && rep_files_.empty()) {
00367     usage();
00368   }
00369   if (flags_.repos_from_stdin && !rep_files_.empty()) {
00370     cerr << "Specify only one source of repository input (files or stdin)"
00371          << std::endl;
00372     usage();
00373   }
00374 }
00375 
00376 Ripper *ripper = NULL;
00377 
00378 /// Prints statistics when we run out of memory
00379 void OutOfMemory() {
00380   cerr << "ran out of memory" << endl;
00381   if( ripper ) {
00382     ripper->PrintStatistics(cerr);
00383   }
00384   abort();
00385 }
00386 
00387 
00388 /// Reads the document repository and produces
00389 /// a WebNode object for each read document. This
00390 /// is handled by GraphBuilder. After the WebNodeList
00391 /// (web link graph) is constructed, enters optionally
00392 /// into interactive mode, by communicating with an
00393 /// external program (jack) through pipes. 
00394 int main(int argc, char** argv) {
00395 
00396 
00397   set_new_handler(OutOfMemory);
00398   ripper = new Ripper();
00399 
00400   cerr << "Welcome to the Google Programming Contest ripper." << std::endl
00401        << "Please see the file LICENSE for terms of use of "
00402        << "the data and code." << std::endl;
00403 
00404   ripper->ParseCmdLineArgs(argc, argv);
00405   ripper->SetupHandlers(); // opens index file
00406 
00407   // header for index file
00408   if (!ripper->flags_.no_graphbuilder &&
00409       ripper->flags_.print_index && ripper->indexout ) {
00410     ripper->indexout << "# Ripper: " << ripper->rippername << endl
00411                      << "# url | id | date | alias_url" << endl;
00412   }
00413 
00414   // this may take a long time to run
00415   if (ripper->flags_.repos_from_stdin) {
00416     ReposReader reprdr(&cin, string("<stdin>"));
00417     ripper->RipRepository(&reprdr);
00418   } else {
00419     for (vector<string>::iterator repname = ripper->rep_files_.begin();
00420          repname != ripper->rep_files_.end(); ++repname) {
00421       std::ifstream repstream((*repname).c_str());
00422       if (! repstream) {
00423         cerr << "error: Cannot open repository file " << *repname 
00424              << ", skipping it" << std::endl;
00425       } else {
00426         ReposReader reprdr(&repstream, *repname);
00427         ripper->RipRepository(&reprdr);
00428       }
00429       repstream.close();
00430     }
00431   }
00432 
00433   // close index file
00434   if (!ripper->flags_.no_graphbuilder &&
00435       ripper->flags_.print_index && ripper->indexout ) {
00436     ripper->indexout.close();
00437   }
00438 
00439   // interactive mode needs two fifos
00440   if(!ripper->flags_.no_graphbuilder && ripper->flags_.interactive) {
00441 
00442     cerr << "tempdir:" << ripper->tempdir <<endl;
00443     Talker *talker = new Talker(ripper->rippername, ripper->tempdir,
00444                                 ripper->PublishWebGraph(), 
00445                                 ripper->GetGraphBuilder(),
00446                                 ripper->flags_.pvm_is_master,
00447                                 ripper->flags_.pvm_numtasks);
00448 
00449     talker->Talk();
00450     delete talker;
00451 
00452   } else { // not interactive
00453     if( !ripper->flags_.no_graphbuilder &&
00454         ripper->flags_.handler_graph_print ) { 
00455       WebLinkGraph *graph = ripper->PublishWebGraph();
00456       graph->PrintWebGraph(cout);
00457     }
00458     ripper->PrintStatistics(cerr);
00459   }
00460 
00461   return 0;
00462 }
00463 

Generated on Wed May 29 11:37:15 2002 for MarkovPR by doxygen1.2.15