This patch extends the external_parsers attribute to allow definition of
external converters, which convert one content-type to another.

--- htdig-3.1.3/htdig/ExternalParser.cc.noconv	Wed Sep 22 11:18:40 1999
+++ htdig-3.1.3/htdig/ExternalParser.cc	Tue Oct 19 16:40:09 1999
@@ -11,6 +11,9 @@ static char RCSid[] = "$Id: ExternalPars
 #endif
 
 #include "ExternalParser.h"
+#include "HTML.h"
+#include "Plaintext.h"
+#include "PDF.h"
 #include "htdig.h"
 #include "htString.h"
 #include "QuotedStringList.h"
@@ -21,6 +24,7 @@ static char RCSid[] = "$Id: ExternalPars
 #include "good_strtok.h"
 
 static Dictionary	*parsers = 0;
+static Dictionary	*toTypes = 0;
 extern String		configFile;
 
 //*****************************************************************************
@@ -88,13 +92,25 @@ ExternalParser::canParse(char *contentTy
     if (!parsers)
     {
 	parsers = new Dictionary();
+	toTypes = new Dictionary();
 	
 	QuotedStringList	qsl(config["external_parsers"], " \t");
+	String			from, to;
 	int			i;
+	int			sep;
 
 	for (i = 0; qsl[i]; i += 2)
 	{
-	    parsers->Add(qsl[i], new String(qsl[i + 1]));
+	    from = qsl[i];
+	    to = "";
+	    sep = from.indexOf("->");
+	    if (sep != -1)
+	    {
+		to = from.sub(sep+2).get();
+		from = from.sub(0, sep).get();
+	    }
+	    parsers->Add(from, new String(qsl[i + 1]));
+	    toTypes->Add(from, new String(to));
 	}
     }
     return parsers->Exists(contentType);
@@ -150,8 +166,45 @@ ExternalParser::parse(Retriever &retriev
     char	*token1, *token2, *token3;
     int		loc, hd;
     URL		url;
+    String	convertToType = ((String *)toTypes->Find(contentType))->get();
+    int		get_hdr = (mystrcasecmp(convertToType, "user-defined") == 0);
+    int		get_file = (convertToType.length() != 0);
+    String	newcontent;
     while (readLine(input, line))
     {
+	if (get_hdr)
+	{
+	    line.chop('\r');
+	    if (line.length() == 0)
+		get_hdr = FALSE;
+	    else if (mystrncasecmp(line, "content-type:", 13) == 0)
+	    {
+		token1 = line.get() + 13;
+		while (*token1 && isspace(*token1))
+		    token1++;
+		token1 = strtok(token1, "\n\t");
+		convertToType = token1;
+	    }
+	    continue;
+	}
+	if (get_file)
+	{
+	    if (newcontent.length() == 0 &&
+		!canParse(convertToType) &&
+		mystrncasecmp(convertToType, "text/", 5) != 0 &&
+		mystrncasecmp(convertToType, "application/pdf", 15) != 0)
+	    {
+		if (mystrcasecmp(convertToType, "user-defined") == 0)
+		    cerr << "External parser error: no Content-Type given\n";
+		else
+		    cerr << "External parser error: can't parse Content-Type \""
+			 << convertToType << "\"\n";
+		cerr << " URL: " << base.get() << "\n";
+		break;
+	    }
+	    newcontent << line << '\n';
+	    continue;
+	}
 	token1 = strtok(line, "\t");
 	if (token1 == NULL)
 	    token1 = "";
@@ -340,6 +393,50 @@ ExternalParser::parse(Retriever &retriev
     }
     pclose(input);
     unlink(path);
+
+    if (newcontent.length() > 0)
+    {
+	static HTML			*html = 0;
+	static Plaintext		*plaintext = 0;
+	static PDF			*pdf = 0;
+	Parsable			*parsable = 0;
+
+	contentType = convertToType;
+	if (canParse(contentType))
+	{
+	    currentParser = ((String *)parsers->Find(contentType))->get();
+	    parsable = this;
+	}
+	else if (mystrncasecmp(contentType, "text/html", 9) == 0)
+	{
+	    if (!html)
+		html = new HTML();
+	    parsable = html;
+	}
+	else if (mystrncasecmp(contentType, "text/plain", 10) == 0)
+	{
+	    if (!plaintext)
+		plaintext = new Plaintext();
+	    parsable = plaintext;
+	}
+	else if (mystrncasecmp(contentType, "application/pdf", 15) == 0)
+	{
+	    if (!pdf)
+		pdf = new PDF();
+	    parsable = pdf;
+	}
+	else
+	{
+	    if (!plaintext)
+		plaintext = new Plaintext();
+	    parsable = plaintext;
+	    if (debug)
+		cout << "External parser error: \"" << contentType <<
+			"\" not a recognized type.  Assuming text\n";
+	}
+	parsable->setContents(newcontent.get(), newcontent.length());
+	parsable->parse(retriever, base);
+    }
 }
 
 
--- htdig-3.1.3/htdoc/attrs.html.noconv	Wed Sep 22 11:18:41 1999
+++ htdig-3.1.3/htdoc/attrs.html	Wed Oct 20 11:37:52 1999
@@ -1625,9 +1625,29 @@
 			content-type that the parser can handle while the
 			second string of each pair is the path to the external
 			parsing program. If quoted, it may contain parameters,
-			separated by spaces.<p>
+			separated by spaces.<br>
+			 External parsing can also be done with external
+			converters, which convert one content-type to
+			another. To do this, instead of just specifying
+			a single content-type as the first string
+			of a pair, you specify two types, in the form
+			<em>type1</em><strong>-&gt;</strong><em>type2</em>,
+			as a single string with no spaces. The second
+			string of the pair then defines an external
+			converter, rather than an external parser, to
+			convert the first type to the second. If the second
+			type is <strong>user-defined</strong>, then
+			it's up to the converter script to put out a
+			"Content-Type:&nbsp;<em>type</em>" header followed
+			by a blank line, to indicate to htdig what type it
+			should expect for the output, much like what a CGI
+			script would do. The resulting content-type must
+			be one that htdig can parse, either internally,
+			or with another external parser or converter.<br>
+			 Only one external parser or converter can be
+			specified for any given content-type.<p>
 			 The parser program takes four command-line
-			parameters, not counting parameters and parameters
+			parameters, not counting any parameters already
 			given in the command string:<br>
 			<em>infile content-type URL configuration-file</em><br>
 			<table border="1">
@@ -1688,7 +1708,10 @@
 			  </tr>
 			</table><p>
 			The external parser is to write information for
-			htdig on its standard output.<br>
+			htdig on its standard output. Unless it is an
+			external converter, which will output a document
+			of a different content-type, its output must
+			follow the format described here.<br>
 			 The output consists of records, each record terminated
 			with a newline. Each record is a series of (unless
 			expressively allowed to be empty) non-empty tab-separated
@@ -1927,7 +1950,9 @@
 				</td>
 				<td nowrap>
 				  text/html /usr/local/bin/htmlparser \<br>
-				  application/ms-word "/usr/local/bin/mswordparser -w"
+				  application/pdf /usr/local/bin/parse_doc.pl \<br>
+				  application/msword-&gt;text/plain "/usr/local/bin/mswordtotxt -w" \<br>
+				  application/x-gunzip-&gt;user-defined /usr/local/bin/ungzipper
 				</td>
 			  </tr>
 			</table>



