/*
 * ngmatch - newsgroup name matching (faster tree-walking version)
 *
 * ngmatch returns true iff the newsgroup(s) in ngs match
 * the pattern(s) in ngpat, where
 *
 * 	ngpats: { ngpat { "," ngpat }* }?
 *	ngpat: "!"? word { "." word }*
 *	word: { alphanum }+ | "all"
 *
 * Only one group need match for success.
 * Failure to match any pattern with a group is a mismatch of that group only.
 * Failure to match any group against any pattern is a total failure.
 *
 * For each group, note the depth of each match against the patterns,
 * negated or not.  Ignore mismatches.  The deepest match wins at the
 * end:  a deeper negated match is a failure, a deeper plain match is a
 * success; a tie is a failure, but see the description of wildcards.
 *
 * "all" in a pattern is a wildcard that matches exactly one word;
 * it does not cross "." (NGDELIM) delimiters.  A non-wildcard match is
 * considered to have matched slightly deeper than a wildcard match in the
 * same position.  This ensures that !alt.sex.all,alt.sex.bondage matches
 * alt.sex.bondage.
 *
 * The obvious implementation involves (number of groups)*(number of
 * patterns) comparisons (unless you get an early match), which can get
 * quite slow, particular when the number of patterns gets large (e.g. when
 * feeding neurotic or cheap neighbours who want *this* and *this* but not
 * *that* oh except for *this*).  This implementation parses the patterns
 * into a tree with common prefixes merged, then walks that tree for each
 * group, so it should only take about 2*(number of groups) comparisons
 * (due to wildcards).  It's actually a little worse in the worst case due
 * to backtracking due to wildcards that appear in non-leaf nodes.  This is
 * a win for complex sys files and probably isn't much worse for trivial
 * ones.  Each node contains a message telling us whether a match at this
 * node is a plain or negated match (i.e. is a match or not).  Barry Shein
 * contributed to this design.
 *
 * Ken Thompson was right: when in doubt, use brute force.  Attempts to
 * use hashing and sorting in this code have slowed it down.  Sheer, brute
 * linear search is fastest for real, live patterns, since 88% of tree nodes
 * have 0 or 1 children (72% have 0).
 *
 * Essentially all the expense is in parsing, matching is pretty fast.
 * Thus preparsing really pays off; using ngmatch isn't a win.
 */

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include "news.h"
#include "ngmatch.h"

#define truth(bool) ((bool)? "yes": "no")
#define LENBIT(len) ((len) > 15? 1: 1 << (len))

/* tunable parameters */
#define RETAIN 300		/* retain & recycle this many patnodes */

/* fundamental constants */
#define UNKNOWN (YES+1)			/* no message attached to this node */
#define ALL "all"			/* word wildcard */

NGPAT {
	NGPAT	*n_next;		/* both next kid and next free node */
	NGPAT	*n_kids;		/* children of this node */
	char	*n_word;		/* atom name */
	short	n_lenmask;		/* bitmask of lengths of self+sibs */
	char	n_match;		/* ternary message */
	char	n_len;			/* length of n_word */
#ifdef STATS
	char	n_nkids;
#endif
};

/* exports */
char *ngerr = NULL;			/* if non-NULL, contains error string */

/* private */
static boolean debug = NO;
static NGPAT *patreuse = NULL;
static int reusables = 0;
/* this cache really does help; it's been measured. */
#define CACHEWORDS 20	/* currently high water mark is 7 (10 nov 1991) */
			/* (sci.physics.edward.teller.boom.boom.boom) */
static struct wordcache {
	char	*word;
	NGPAT	*node;
	char	len;
} wordcache[CACHEWORDS];
static int cachedepth = 0;
/* a bit ugly, but these save tree nodes and time. */
static NGPAT *lastnp = NULL, *tree = NULL;
#ifdef STATS
#define MAXKIDS 20			/* maximum kids to record in stats */
static long kids[MAXKIDS];
#endif

/* forwards */
FORWARD nginprpat();
FORWARD int treematch();

void
matchdebug(state)
boolean state;
{
	debug = state;
}

STATIC NGPAT *
patalloc()					/* allocate a pattern node */
{
	register NGPAT *np = patreuse;

	if (np == NULL)
		return (NGPAT *)nemalloc(sizeof *np);
	else {
		/* pull the first reusable one off the pile */
		patreuse = np->n_next;
		reusables--;
		return np;
	}
}

STATIC
patfree(np)					/* free a pattern node */
register NGPAT *np;
{
	if (reusables >= RETAIN)		/* compost heap is full? */
		free((char *)np);		/* yup, just pitch this one */
	else {					/* no, just stash for reuse */
		++reusables;
		np->n_next = patreuse;
		patreuse = np;
	}
}


/*
 * search tree from np through np's siblings for word of length len.
 * most words (95%) are not found.
 * using word length as a pre-filter seems to be effective at
 * reducing the number of expensive strcmps because length of words
 * in newsgroup names is fairly evenly distributed across the range 3-8.
 * using the masks of lengths extends this pre-filter to all succeeding
 * nodes without having to examine them all.
 *
 * This function is a profiling hot spot.
 */
STATIC NGPAT *
treesearch(np, word, len)
register NGPAT *np;			/* better not be NULL */
register char *word;
register int len;
{
	register char *npword;
	register char word1stc = word[0];

	do {
		if (len == np->n_len) {
			/* in-line STREQ for greater speed */
			npword = np->n_word;
			if (*npword == word1stc && strcmp(npword, word) == 0)
				return np;
		}
	} while ((np = np->n_next) != NULL);
	return NULL;
}

/*
 * add an atom next to curnp for "word", unless it's already there;
 * return a pointer to the node for "word".
 * keep a cache of (string, tree node) pairs for each level in the tree.
 */
STATIC NGPAT *
addatom(curnp, word, len, level, wcp, cachematchp)
register NGPAT *curnp;			/* may be NULL */
register char *word;
register int len, level;
register struct wordcache *wcp;
register int *cachematchp;
{
	/*
	 * look up this word in the cache (finds 40%).  if absent,
	 * insert it in cache and tree.
	 */
	if (*cachematchp && level < cachedepth && level < CACHEWORDS &&
	    len == wcp->len && STREQ(word, wcp->word))
		return (lastnp = wcp->node);
	else {
		register NGPAT *newnp;
		register int lenmask = LENBIT(len), nxtlenmsk = 0;

		/* only 1.5% of these lookups succeed */
		if (curnp == NULL ||
		    !((nxtlenmsk = curnp->n_lenmask)&lenmask) ||
		    (newnp = treesearch(curnp, word, len)) == NULL) {
			/* the common case */
			if (word[0] == '\0')
				ngerr = "empty word";
			newnp = patalloc();
			newnp->n_match = UNKNOWN;
			newnp->n_word = word;
			newnp->n_len = len;
			newnp->n_lenmask = lenmask;
			newnp->n_kids = NULL;
			/* new node's sibs are this node's kids */
			newnp->n_next = curnp;
			newnp->n_lenmask |= nxtlenmsk;

			/* add new node as kid of previous node, if any */
			if (lastnp != NULL)
				lastnp->n_kids = newnp;
			if (level == 0)
				tree = newnp;
#ifdef STATS
			if (lastnp != NULL)
				lastnp->n_nkids++;
			newnp->n_nkids = 0;
#endif
		}
		*cachematchp = NO;
		if (level < CACHEWORDS) {
			cachedepth = level + 1;
			wcp->word = word;
			wcp->len = len;
			wcp->node = newnp;
		}
		lastnp = newnp;
		return newnp;
	}
}

/*
 * add pattern p to the tree `curnp', if any.  returns resulting tree.
 * modifies p, which must persist until we are done with this tree.
 */
STATIC NGPAT *
addpat(curnp, p)
register NGPAT *curnp;		/* may be NULL for first pattern */
char *p;
{
	register char *word, *period = p;
	register char c;
	register int level = 0;
	register struct wordcache *wcp = wordcache;
	register int leafmatch = YES;
	int cachematching = YES;

	/* consume negation character if any */
	if (*period == NGNEG) {
		period++;
		leafmatch = NO;
	}

	lastnp = NULL;		/* no parent for first atom */
	for (word = period; (c = *period++) != '\0'; )
		/* inline STRCHR(word, NGDELIM, period); */
		if (c == NGDELIM) {
			/* there appears to be another word in the pattern */
			period[-1] = '\0';	/* NOT restored below */
			curnp = addatom(curnp, word, period - 1 - word,
				level++, wcp++, &cachematching)->n_kids;
			word = period;		/* next atom! */
		}
	/* one last pattern */
	curnp = addatom(curnp, word, period - 1 - word, level, wcp,
		&cachematching);
	if (curnp->n_match != UNKNOWN)
		if (curnp->n_match == leafmatch)
			ngerr = "redundant pattern";
		else
			ngerr = "contradictory pattern";
	curnp->n_match = leafmatch;
	return tree;
}

/*
 * parse ngpat into a tree with no redundant nodes.
 * modifies ngpat, which must persist until we are done
 * with the resulting tree.
 *
 * break the list into NUL-terminated patterns.  add them to the parse tree.
 */
NGPAT *
ngparse(ngpat)
char *ngpat;
{
	register char *p, *comma;
	register char c;
	register NGPAT *pat = NULL;

	ngerr = NULL;
	if (ngpat[0] == '\0')
		ngerr = "empty pattern list";
	cachedepth = 0;
	tree = NULL;
	for (p = comma = ngpat; (c = *comma++) != '\0'; )
		/* inline STRCHR(p, NGSEP, comma); */
		if (c == NGSEP) {
			/* there appears to be another pattern in the list */
			comma[-1] = '\0';	/* NOT restored below */
			if (*p == '\0')
				ngerr = "empty pattern";
			pat = addpat(pat, p);
			p = comma;		/* advance to next pattern */
		}
	/* one last pattern to process */
	return addpat(pat, p);
}


struct prhook {
	char	*group;
	FILE	*stream;
	short	first;
};

STATIC
prnode(np, hook)
register NGPAT *np;
register struct prhook *hook;
{
	register char *newgrp, *period;

	if (hook->group == NULL)
		hook->group = strsave("");
	/* append this word */
	if (hook->group[0] == '\0') {
		free(hook->group);
		hook->group = strsave(np->n_word);
	} else {
		newgrp = str3save(hook->group, SNGDELIM, np->n_word);
		free(hook->group);
		hook->group = newgrp;
	}
	/* check for a message at this node & print if found */
	if (np->n_match != UNKNOWN) {
		if (hook->first)
			hook->first = NO;
		else
			(void) putc(NGSEP, hook->stream);
		if (np->n_match == NO)
			(void) putc(NGNEG, hook->stream);
		(void) fputs(hook->group, hook->stream);
	}
	/* recurse on any children */
	nginprpat(np->n_kids, hook);
	/* remove this word */
	period = strrchr(hook->group, NGDELIM);
	if (period == NULL)
		hook->group[0] = '\0';
	else
		*period = '\0';
}

/* print the pattern tree, internal version */
STATIC
nginprpat(np, hook)
register NGPAT *np;
register struct prhook *hook;
{
	for (; np != NULL; np = np->n_next)
		prnode(np, hook);
}

/* print a pattern tree */
ngprint(pat, stream)
NGPAT *pat;
FILE *stream;
{
	static struct prhook hook;

	hook.first = YES;
	hook.stream = stream;
	nginprpat(pat, &hook);
}


#ifndef STATS
/* ARGSUSED */
ngprstats(stream)
FILE *stream;
{
}
#else				/* STATS */
/* collect tree statistics */
STATIC
ngstatpat(np)
register NGPAT *np;
{
	for (; np != NULL; np = np->n_next) {
		if (np->n_nkids >= MAXKIDS - 1)
			kids[MAXKIDS-1]++;
		else
			kids[np->n_nkids]++;
		if (np->n_kids != NULL)
			ngstatpat(np->n_kids);
	}
}

/* print the tree statistics */
ngprstats(stream)
FILE *stream;
{
	register int n;

	(void) fprintf(stream, "kids\t# nodes\n");
	for (n = 0; n < MAXKIDS; n++)
		(void) fprintf(stream, "%d\t%ld\n", n, kids[n]);
}
#endif				/* STATS */


struct matchook {
	char *suffix;
	int match;
};

#define finalscore(defmatch, realscore, nominalscore) \
	((defmatch) == UNKNOWN? realscore: nominalscore)
#define abs(n) ((n) < 0? -(n): (n))

/*
 * match group (ngpfx.sfx) against the patterns in the tree pat;
 * return a score to indicate depth and kind of match.
 * In particular, the first word (ngpfx) should be found in
 * the pat or its siblings, if any.
 */
STATIC int
matscore(pat, defmatch, ngpfx, ngplen, sfx, realscore, nominalscore, incr)
register NGPAT *pat;			/* pattern subtree */
register int defmatch;			/* inherited match value */
char *ngpfx, *sfx;			/* newsgroup prefix & suffix */
register int ngplen;
register int nominalscore, realscore;
int incr;
{
	register NGPAT *wdnp;
	register int ret;

	if (debug) {
		(void) fprintf(stderr, "matscore(");
		ngprint(pat, stderr);
		(void) fprintf(stderr,
		", defmatch %d, ngpfx %s, sfx %s, scores %d %d, incr %d): ",
			defmatch, ngpfx, sfx, realscore, nominalscore, incr);
	}

	/*
	 * any pattern word matches sfx word? succeeds about 33% of the time.
	 * if no match, use the last "known" score.
	 * if a match, try the next tree level.
	 */
	if (!(pat->n_lenmask&LENBIT(ngplen)) ||
	    (wdnp = treesearch(pat, ngpfx, ngplen)) == NULL) {
		ret = finalscore(defmatch, realscore, nominalscore);
		if (debug)
			(void) fprintf(stderr, "missing; returning %d\n", ret);
		return ret;
	} else {
		if (debug)
			(void) fprintf(stderr,
				"found; continuing with treematch\n");
		defmatch = wdnp->n_match;
		if (nominalscore < 0)
			nominalscore = -nominalscore;	/* absolute value */
		nominalscore += incr;
		if (defmatch == NO)
			nominalscore = -nominalscore;
		if (defmatch != UNKNOWN)
			realscore = nominalscore;
		return treematch(wdnp->n_kids, defmatch, sfx,
			realscore, nominalscore);
	}
}

/*
 * match group suffix ngsfx against the patterns in the tree pat.
 * In particular, the first word of ngsfx should be found in the children of
 * pat, if any.  Alternately, if "all" appears in a child, follow that subtree.
 * Matches with a message of "UNKNOWN" don't contribute to the score,
 * they just allow us to keep matching (we need to note our level in
 * the tree and any discount for wildcard matches, in case we later get
 * a match with a non-"UNKNOWN" message).
 */
STATIC int				/* (depth of match) * sign(match) */
treematch(pat, defmatch, ngsfx, realscore, nominalscore)
register NGPAT *pat;			/* pattern subtree; may be NULL */
register int defmatch;			/* inherited match value */
char *ngsfx;				/* newsgroup suffix */
register int realscore, nominalscore;
{
	register char *period, *sfx;
	register int ret, litscore, wildscore, ngplen;

	if (debug) {
		(void) fprintf(stderr, "treematch(");
		ngprint(pat, stderr);
		(void) fprintf(stderr, ", defmatch %d, %s, scores %d %d): ",
			defmatch, ngsfx, realscore, nominalscore);
	}
	/*
	 * if the pattern or group is exhausted,
	 * defmatch==UNKNOWN is a failure.
	 */
	if (pat == NULL || ngsfx == NULL || ngsfx[0] == '\0') {
		ret = finalscore(defmatch, realscore, nominalscore);
		if (debug)
			(void) fprintf(stderr,
				"returning %d due to exhaustion\n", ret);
		return ret;
	}
	if (debug)
		(void) fprintf(stderr, "...\n");

	STRCHR(ngsfx, NGDELIM, period);
	if (period != NULL) {
		ngplen = period - ngsfx;
		*period = '\0';		/* restored below */
		sfx = period + 1;	/* next words */
	} else {
		ngplen = strlen(ngsfx);
		sfx = "";
	}

	litscore =  matscore(pat, defmatch, ngsfx, ngplen, sfx,
		realscore, nominalscore, 20);
	wildscore = matscore(pat, defmatch, ALL, (int)STRLEN(ALL), sfx,
		realscore, nominalscore, 19);

	if (period != NULL) {
		*period++ = NGDELIM;
		if (period[0] == '\0')
			ngerr = "empty word in group suffix";
	}

	/* return deeper match */
	ret = (abs(wildscore) < abs(litscore)? litscore: wildscore);
	if (debug) {
		(void) fprintf(stderr, "treematch(");
		ngprint(pat, stderr);
		(void) fprintf(stderr,
			", defmatch %d, %s, scores %d %d) returns %d\n",
			defmatch, ngsfx, realscore, nominalscore, ret);
	}
	return ret;
}

/*
 * match groups ngs against the patterns in the tree pat, by walking the tree
 * for each group; first unbanged match wins;
 * return the score (0: no match, <0: banged match, >0: unbanged match,
 * greater magnitude indicates a deeper match).
 */
int
ngpatscore(pat, ngs)
register NGPAT *pat;
register char *ngs;
{
	register char *ngp;			/* point at current group */
	register char *ngcomma;
	register int score = 0;			/* no match yet */

	if (ngs[0] == '\0')
		ngerr = "empty group list";
	for (ngp = ngs; score <= 0 && ngp != NULL; ngp = ngcomma) {
		STRCHR(ngp, NGSEP, ngcomma);
		if (ngcomma != NULL)
			*ngcomma = '\0';	/* will be restored below */

		if (ngp[0] == '\0')
			ngerr = "empty group";
		/* match one group (ngp) against all patterns at once */
		score = treematch(pat, UNKNOWN, ngp, 0, 0);

		if (ngcomma != NULL)
			*ngcomma++ = NGSEP;	/* point after the comma */
	}
	return score;
}

/* match groups ngs against the patterns in the tree pat */
boolean
ngpatmat(pat, ngs)
NGPAT *pat;
char *ngs;
{
	return ngpatscore(pat, ngs) > 0;
}

/* compatibility interface; not a very fast way to match */
boolean
ngmatch(ngpat, ngs)
char *ngpat, *ngs;
{
	register NGPAT *pat;
	register boolean match = NO;
	register char *patcopy = strsave(ngpat);

	if (debug)
		(void) putc('\n', stderr);
	pat = ngparse(patcopy);		/* ngparse destroys patcopy */
	if (pat != NULL) {
#ifdef STATS
		ngstatpat(pat);
#endif
#ifdef NGPRINT
		(void) putc('\n', stderr);
		ngprint(pat, stderr);
		(void) putc('\n', stderr);
#endif
		match = ngpatmat(pat, ngs);
		ngfree(pat);
	}
	free(patcopy);
	return match;
}


/* tear down a pattern tree and free the nodes */
ngfree(np)
register NGPAT *np;
{
	register NGPAT *next = np, *kids;

	while ((np = next) != NULL) {
		next = np->n_next;
		if ((kids = np->n_kids) != NULL)
			ngfree(kids);
		patfree(np);
	}
}