From: Geoff Hutchison <ghutchis@wso.williams.edu>
To: Joe R. Jah <jjah@cloud.ccsf.cc.ca.us>
Cc: htdig-general@lists.sourceforge.net
Subject: Re: [htdig] Redirected broken URL's self refer;(

At 6:38 PM -0800 2/7/02, Joe R. Jah wrote:
>Is it possible to have htdig disregard such redirections and report:
>
>Not found: http://canonical.com/canonical/path/broken.html Ref: 
>http://canonical.com/canonical/path/culprit.html

Yeah, this would fit my definition of a minor bug. I'd imagine it's 
been around for ages, since this code hasn't changed much even since 
3.0.8b2, back when I first discovered ht://Dig.

I haven't tested this patch, but give it a whirl; it should solve the 
problem. The usual disclaimers apply with regard to using untested 
patches. ;-)

-- 
--
-Geoff Hutchison
Williams Students Online
http://wso.williams.edu/

Index: htdig/Retriever.cc
===================================================================
RCS file: /cvsroot/htdig/htdig/htdig/Retriever.cc,v
retrieving revision 1.36.2.28
diff -c -3 -p -r1.36.2.28 Retriever.cc
*** htdig/Retriever.cc	25 Jan 2002 04:44:33 -0000	1.36.2.28
--- htdig/Retriever.cc	9 Feb 2002 03:26:57 -0000
*************** Retriever::parse_url(URLRef &urlRef)
*** 531,537 ****
  	    if (debug)
  		cout << " redirect" << endl;
  	    words.MarkGone();
! 	    got_redirect(doc->Redirected(), ref);
  	    break;
  	    
         case Document::Document_not_authorized:
--- 531,537 ----
  	    if (debug)
  		cout << " redirect" << endl;
  	    words.MarkGone();
! 	    got_redirect(doc->Redirected(), ref, urlRef.Referer());
  	    break;
  	    
         case Document::Document_not_authorized:
*************** Retriever::got_href(URL &url, char *desc
*** 1418,1424 ****
  // void Retriever::got_redirect(char *new_url, DocumentRef *old_ref)
  //
  void
! Retriever::got_redirect(char *new_url, DocumentRef *old_ref)
  {
      // First we must piece together the new URL, which may be relative
      URL parent(old_ref->DocURL());
--- 1418,1424 ----
  // void Retriever::got_redirect(char *new_url, DocumentRef *old_ref)
  //
  void
! Retriever::got_redirect(char *new_url, DocumentRef *old_ref, char *referer)
  {
      // First we must piece together the new URL, which may be relative
      URL parent(old_ref->DocURL());
*************** Retriever::got_redirect(char *new_url, D
*** 1516,1523 ****
  		    servers.Add(url.signature(), server);
  		    delete localRobotsFile;
  		}
! 		server->push(url.get(), ref->DocHopCount(), base->get(),
! 				IsLocalURL(url.get()));
  
  		String	temp = url.get();
  		visited.Add(temp, 0);
--- 1516,1527 ----
  		    servers.Add(url.signature(), server);
  		    delete localRobotsFile;
  		}
! 		if (!referer || strlen(referer) == 0)
! 		  server->push(url.get(), ref->DocHopCount(), base->get(),
! 			       IsLocalURL(url.get()));
! 		else
! 		  server->push(url.get(), ref->DocHopCount(), referer,
!                                IsLocalURL(url.get()));
  
  		String	temp = url.get();
  		visited.Add(temp, 0);
Index: htdig/Retriever.h
===================================================================
RCS file: /cvsroot/htdig/htdig/htdig/Retriever.h,v
retrieving revision 1.8.2.4
diff -c -3 -p -r1.8.2.4 Retriever.h
*** htdig/Retriever.h	7 Jun 2001 22:11:29 -0000	1.8.2.4
--- htdig/Retriever.h	9 Feb 2002 03:26:57 -0000
*************** private:
*** 130,136 ****
      int			IsValidURL(char *url);
      void		RetrievedDocument(Document &, char *url, DocumentRef *ref);
      void		parse_url(URLRef &urlRef);
!     void		got_redirect(char *, DocumentRef *);
      void		recordNotFound(char *url, char *referer, int reason);
  };
  
--- 130,136 ----
      int			IsValidURL(char *url);
      void		RetrievedDocument(Document &, char *url, DocumentRef *ref);
      void		parse_url(URLRef &urlRef);
!     void		got_redirect(char *newURL, DocumentRef *, char *referer = 0);
      void		recordNotFound(char *url, char *referer, int reason);
  };
  

