: patch to allow addition of URLs for htdig-3.1.5
: conversion: 2000/07/21 uitm@blackflag.ru (original message follows)
: 
: from http://www.htdig.org/mail/1998/04/0046.html
:
:                        patch to allow addition of URLs
:   ------------------------------------------------------------------------
: Edmond Abrahamian (edmond@greencedars.com.lb)
: Wed, 15 Apr 1998 21:53:15 +0200 (EET)
:    * Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
:    * Next message: Olivier PRENANT: "htdig: htdig & apache 1.3xx"
:    * Previous message: Pierre Garriga: "htdig-3.0.8b2"
:   ------------------------------------------------------------------------
: Hi Andrew,
:    I am submitting this small patch for the file htdig/main.cc. It allows
: the option of digging new URLs without having to re-dig all the URLs that
: are already in the database. The patch is included as an attachment, and is
: to be fed to Larry Wall's patch program on stdin. It will prompt you for
: the file to patch (main.cc) which it will *replace*. I should mention that
: I am talking about the htdig-3.0.8b2 version in this context.
:    Of course, once the new URL(s) are dug, we still have to run htmerge
: afterwards, but we save an awful amount of time by incrementally adding
: URLs rather than digging everything from scratch.
:    I hope I have not grossly overlooked anything. I looked into the whole
: htdig program, after which I concluded that this simple fix should do. I
: hope I am right...
:    I have tested it on one small database and one rather large database,
: with great success. If you agree, I would hope to give the htdig
: community the chance to beat on the patch while the stuff is still in beta.
:    regards,
:   Edmond Abrahamian (edmond@greencedars.com.lb)
: 202,204c
:     if (!new_urls_only) {
:        List *list = docs.URLs();
:        retriever.Initial(*list);
:        delete list;
:     }
: .
: 200c
:     // URLs to the initial list of the retriever. However do this only
:     // if we are not adding new URLs only (i.e. -n option)
: .
: 196c
:     // seed the retriever object with the list of start URLs, unless
:     // we're requesting to add new URLs only without scanning the ones
:     // already in the database (-n option), in which case we will seed
:     // the retriever with those new URLs only
:     if (!new_urls_only)
:        retriever.Initial(config["start_url"]);
:     else
:        retriever.Initial(config["new_url"]);
: .
: 151a
: .
: 143a
: .
: 142c
:     String l;
:     if (!new_urls_only)
:        l = config["limit_urls_to"];
:     else
:        l = config["limit_new_urls_to"];
: .
: 65a
:             case 'n':
:                 new_urls_only=1;
:                 break;
: .
: 38c
:     while ((c = getopt(ac, av, "sc:vith:u:an")) != -1)
: .
: 33a
:     int new_urls_only = 0;
: .
:   ------------------------------------------------------------------------
:    * Next message: Olivier PRENANT: "htdig: htdig & apache 1.3xx"
:    * Previous message: Pierre Garriga: "htdig-3.0.8b2"
:   ------------------------------------------------------------------------
:This archive was generated by hypermail 2.0b3 on Sat Jan 02 1999 - 16:26:01 PST


--- htdig.cc.orig	Fri Feb 25 05:29:10 2000
+++ htdig.cc	Fri Jul 21 19:00:47 2000
@@ -33,6 +33,7 @@
 FILE			*urls_seen = NULL;
 FILE			*images_seen = NULL;
 String			configFile = DEFAULT_CONFIG_FILE;
+int			new_urls_only = 0;
 
 void usage();
 void reportError(char *msg);
@@ -55,7 +56,7 @@
     //
     // Parse command line arguments
     //
-    while ((c = getopt(ac, av, "lsc:vith:u:a")) != -1)
+    while ((c = getopt(ac, av, "lsc:vith:u:an")) != -1)
     {
         int pos;
 	switch (c)
@@ -89,6 +90,9 @@
 	    case 'l':
 		flag = Retriever_logUrl;
 		break;
+            case 'n':
+                new_urls_only = 1;
+                break;
 	    case '?':
 		usage();
 	}
@@ -184,7 +188,11 @@
     //
     // Set up the limits list
     //
-    StringList l(config["limit_urls_to"], " \t");
+    StringList l;
+    if (new_urls_only == 0)
+	l.Create(config["limit_urls_to"], " \t");
+    else
+	l.Create(config["limit_new_urls_to"], " \t");
     limits.IgnoreCase();
     limits.Pattern(l.Join('|'));
     l.Release();
@@ -234,19 +242,32 @@
     //
     // Create the Retriever object which we will use to parse all the
     // HTML files.
-    // In case this is just an update dig, we will add all existing
-    // URLs?
-    //
     Retriever	retriever(flag);
-    List	*list = docs.URLs();
-    retriever.Initial(*list);
-    delete list;
-
-    // Add start_url to the initial list of the retriever.
-    // Don't check a URL twice!
-    // Beware order is important, if this bugs you could change 
-    // previous line retriever.Initial(*list, 0) to Initial(*list,1)
-    retriever.Initial(config["start_url"], 1);
+
+    // In case this is just an update dig, we will add all existing
+    // URLs to the initial list of the retriever. However do this only
+    // if we are not adding new URLs only (i.e. -n option)
+
+    if (new_urls_only == 0)
+    {
+	List	*list = docs.URLs();
+	retriever.Initial(*list);
+	delete list;
+    }
+    else
+    if (debug)
+    {
+	cout << "adding new URLs only (-n option given)" << endl;
+    }
+
+    // Seed the retriever object with the list of start URLs, unless
+    // we're requesting to add new URLs only without scanning the ones
+    // already in the database (-n option), in which case we will seed
+    // the retriever with those new URLs only
+    if (new_urls_only == 0)
+       retriever.Initial(config["start_url"], 1);
+    else
+       retriever.Initial(config["new_url"], 1);
 
     //
     // Go do it!

