Date: 05 May 2003 16:15:06 +0200 From: Gabriele Bartolini To: "ht://Dig - Dev" Subject: [htdig-dev] Regex patch for 3.1.6 - ready Hi guys, I managed to make it work today ... If some of you could please put it in the patch archive ... :-) Basically it is possible to specify a regular expression in the restrict and exclude attributes, as: restrict: [^http://www.comune.prato.it/\$] Ciao ciao -Gabriele -- Gabriele Bartolini - Web Programmer Comune di Prato - Prato - Tuscany - Italy g.bartol@comune.prato.it | http://www.comune.prato.it > find bin/laden -name osama -exec rm {} ; diff -3upr ../htdig-3.1.6/htsearch/Display.cc ./htsearch/Display.cc --- ../htdig-3.1.6/htsearch/Display.cc Fri Feb 1 00:47:18 2002 +++ ./htsearch/Display.cc Mon May 5 16:06:34 2003 @@ -212,15 +212,14 @@ Display::display(int pageNumber) int Display::includeURL(char *url) { - if (limitTo && limitTo->FindFirst(url) < 0) + if (limitTo && limitTo->match(url, 1, 0) == 0) { return 0; } else { if (excludeFrom && - excludeFrom->hasPattern() && - excludeFrom->FindFirst(url) >= 0) + excludeFrom->match(url, 0, 0) != 0) return 0; else return 1; Only in ./htsearch: Display.cc.orig diff -3upr ../htdig-3.1.6/htsearch/Display.h ./htsearch/Display.h --- ../htdig-3.1.6/htsearch/Display.h Fri Feb 1 00:47:18 2002 +++ ./htsearch/Display.h Mon May 5 16:06:34 2003 @@ -18,6 +18,7 @@ #include "DocumentDB.h" #include "Database.h" #include "Dictionary.h" +#include "HtRegex.h" class Display : public Object { @@ -34,8 +35,8 @@ public: void setResults(ResultList *results); void setSearchWords(List *searchWords); - void setLimit(StringMatch *); - void setExclude(StringMatch *); + void setLimit(HtRegex *); + void setExclude(HtRegex *); void setAllWordsPattern(StringMatch *); void setLogicalWords(char *); void setOriginalWords(char *); @@ -82,8 +83,8 @@ protected: // // Pattern that all result URLs must match or exclude // - StringMatch *limitTo; - StringMatch *excludeFrom; + HtRegex *limitTo; + HtRegex *excludeFrom; // // Pattern of 
all the words @@ -173,13 +174,13 @@ protected: //***************************************************************************** inline void -Display::setLimit(StringMatch *limit) +Display::setLimit(HtRegex *limit) { limitTo = limit; } inline void -Display::setExclude(StringMatch *exclude) +Display::setExclude(HtRegex *exclude) { excludeFrom = exclude; } Only in ./htsearch: Display.h.orig diff -3upr ../htdig-3.1.6/htsearch/htsearch.cc ./htsearch/htsearch.cc --- ../htdig-3.1.6/htsearch/htsearch.cc Fri Feb 1 00:47:18 2002 +++ ./htsearch/htsearch.cc Mon May 5 16:07:26 2003 @@ -27,6 +27,7 @@ static char RCSid[] = "$Id: htsearch.cc, #include "HtURLCodec.h" #include "HtURLRewriter.h" #include "HtWordType.h" +#include "HtRegex.h" // If we have this, we probably want it. #ifdef HAVE_GETOPT_H @@ -62,8 +63,8 @@ main(int ac, char **av) List searchWords; String configFile = DEFAULT_CONFIG_FILE; int pageNumber = 1; - StringMatch limit_to; - StringMatch exclude_these; + HtRegex limit_to; + HtRegex exclude_these; String logicalWords; String origPattern; String logicalPattern; @@ -215,13 +216,13 @@ main(int ac, char **av) if (strlen(config["restrict"])) { - // Create a temporary list from either the configuration - // file or the input parameter - urllist.Create(config["restrict"], "| \t\r\n\001"); - urlpat = urllist.Join('|'); - urllist.Release(); // release the temporary list of URLs - config.Add("restrict", urlpat); // re-create the config attribute - limit_to.Pattern(urlpat); // Set the new limit pattern + // Create a temporary list from either the configuration + // file or the input parameter + urllist.Create(config["restrict"], "| \t\r\n\001"); + limit_to.setEscaped(urllist); // Set the new limit pattern + urlpat = urllist.Join('|'); + config.Add("restrict", urlpat); // re-create the config attribute + urllist.Release(); // release the temporary list of URLs } if (strlen(config["exclude"])) @@ -229,10 +230,10 @@ main(int ac, char **av) // Create a temporary list from either 
the configuration // file or the input parameter urllist.Create(config["exclude"], "| \t\r\n\001"); + exclude_these.setEscaped(urllist); urlpat = urllist.Join('|'); - urllist.Release(); // release the temporary list of URLs config.Add("exclude", urlpat); // re-create the config attribute - exclude_these.Pattern(urlpat); + urllist.Release(); // release the temporary list of URLs } // Ctype-like functions for what constitutes a word.