To: leafnode-list@wpxx02.toxi.uni-wuerzburg.de Subject: [PATCH] Scoring capabilities within the filter file. From: Lloyd Zusman Date: 22 Mar 1999 01:54:37 -0500 Organization: Linux Hippopotamus Preserve Reply-To: leafnode-list@wpxx02.toxi.uni-wuerzburg.de User-Agent: Gnus/5.07008 (Pterodactyl Gnus v0.80) XEmacs/21.2(beta4) (Aglaophonos) As described in my previous message, here is a patch to leafnode-1.9.2 which adds a simple scoring capability to the filter file processing. There's some added syntax in the filter file which, if absent, will cause leafnode's filtering capabilities to work the same as the non-scoring version. The details of how this preliminary scoring works are described in my other message: Subject: [ANNOUNCE] Scoring capabilities within the filter file. Here is the patch for leafnode-1.9.2 ... please let me know how this works for you. Lloyd Zusman ljz@asfast.com -------------------------------- cut here -------------------------------- *** applyfilter.c.orig Sat Mar 20 23:05:15 1999 --- applyfilter.c Mon Mar 22 04:07:20 1999 *************** *** 64,70 **** int main( int argc, char * argv[] ) { char c[4] = "-\\|/" ; ! int i, n, error, option; char *k, *l; FILE *f; DIR * d; --- 64,70 ---- int main( int argc, char * argv[] ) { char c[4] = "-\\|/" ; ! int i, n, score, option; char *k, *l; FILE *f; DIR * d; *************** *** 143,151 **** fread( l, sizeof(char), st.st_size, f ); if ( ( k = strstr( l, "\n\n" ) ) != NULL ) *k = '\0'; /* cut off body */ ! error = dofilter( l ); fclose( f ); ! if ( error ) { unlink( de->d_name ); if ( verbose ) printf( "%s deleted\n", de->d_name ); --- 143,151 ---- fread( l, sizeof(char), st.st_size, f ); if ( ( k = strstr( l, "\n\n" ) ) != NULL ) *k = '\0'; /* cut off body */ ! score = dofilter( &l, 1 ); fclose( f ); ! 
if ( score < 0 ) { unlink( de->d_name ); if ( verbose ) printf( "%s deleted\n", de->d_name ); *** fetch.c.orig Sat Mar 20 22:47:13 1999 --- fetch.c Mon Mar 22 04:08:29 1999 *************** *** 814,834 **** } } /* regexp pattern matching */ ! if ( filter ) { ! int error; ! h = 0; ! error = 0; ! while ( ( h < 10 ) && !error ) { ! error = dofilter( hd[h] ); ! if ( error ) { ! killed++; ! syslog( LOG_INFO, "Killed article %d %s", stufftoget[i], ! hd[4] ); ! } ! h++; } - if ( error ) - continue; } /* store articles */ --- 814,827 ---- } } /* regexp pattern matching */ ! if ( fgroup ) { ! int score = dofilter( hd, 10 ); ! if ( score < 0 ) { ! killed++; ! syslog( LOG_INFO, "Killed article %d %s", stufftoget[i], ! hd[4] ); ! continue; } } /* store articles */ *** filterutil.c.orig Sat Mar 20 21:26:20 1999 --- filterutil.c Mon Mar 22 06:43:09 1999 *************** *** 5,10 **** --- 5,92 ---- Copyright 1998. See file COPYING for restrictions on the use of this software. + + ----------------------------------------------------------------------------- + + By Lloyd Zusman , 1999/03/22: + + I have added the capability to recognize and process scoring + lines within the filter file (the syntax and usage of these scoring + lines is described below). This is backwards compatible: if no + such scoring lines exist, then the filter file will be interpreted + exactly as it is in previous leafnode versions. + + However, if one or more scoring lines appear within the filter file, + they will be interpreted as follows: + + [score:N] + [score:=N] + [score::N] + [score::=N] + normal regexp entries + ... + ... etc. ... + ... + + ... where N is a positive or negative integer. For any regexp entries + that match, this either adds N to the current score (in the cases without + the equal sign), or else (in the cases where the equal sign is used) it + sets the current score to N. 
+ + In the case of a single colon, a matched regexp will cause the rest + of the regexps in the current scoring block to be ignored, and + the next scoring block to be tested. In the case of a double + colon, once a regexp matches, no more regexps will be tested and the + current score is what will be associated with the article being tested. + + Whenever a new scoring entry is encountered, its rules supersede + any previous rules. + + Any article whose score is >= 0 will be accepted, and any article with + a score < 0 will be rejected. Each article's score is set to 0 by + default before scoring begins. + + If no scoring line precedes any regular expressions, we operate + as if [score:=-1] was issued. This is equivalent to the earlier + filter file paradigm, and this is what makes this algorithm backwards + compatible. + + For example, consider the following hypothetical filter file: + + [score:: =1] + ^Newsgroups:.*alt.hippo.potamus + + [score: -10] + . + + [score: +6] + ^Subject:.*food + [score: +6] + ^Subject:.*hippopotamus + [score: +6] + ^Subject:.*telekinesis + + The first section sets any article posted to alt.hippo.potamus to be + scored with 1 point and to be processed no further. This guarantees + that we will always download all articles posted to this newsgroup, + since 1 is greater than or equal to zero. + + The second section causes all remaining articles to have -10 added to + their score. Since these articles all start out with a default score + of 0, this causes all of them to have a -10 score. + + Because we didn't use the '::' form of scoring line here, all articles + will then be processed further, and they are then handled within the + 3rd-5th sections. Here, 6 points are added to the score for + each of the following words appearing in the subject: "food", + "hippopotamus", and "telekinesis". 
This has the result of causing us + to download all articles that have at least two of these words in their + subjects, since only in this case would the score get incremented to a + value greater than or equal to zero. + + This is a simple and rather primitive scoring algorithm, but it's + powerful enough to handle many useful cases. + */ #include "leafnode.h" *************** *** 16,22 **** #include #include ! struct filterlist * filter; /* * read filters into memory. Filters are just plain regexp's --- 98,197 ---- #include #include ! typedef struct { ! int score; ! unsigned int flags; ! } parseresults; ! ! static struct filterlist * filter; ! struct fgrouplist * fgroup = NULL; ! ! /* ! * parse score lines ! */ ! static int parsescore( char * l, parseresults * results ) { ! ! int score = 0; ! unsigned int flags = 0; ! char * s; ! ! if ( l == NULL || results == NULL) ! return (0); ! while ( *l == ' ' || *l == '\t' ) ! l++; ! if ( *l++ != '[' ) ! return (0); ! while ( *l == ' ' || *l == '\t' ) ! l++; ! if ( strncasecmp( l, "score", 5 ) != 0 ) ! return (0); ! l += 5; ! while ( *l == ' ' || *l == '\t' ) ! l++; ! if ( *l++ != ':' ) ! return (0); ! if ( *l == ':' ) { ! l++; ! flags |= FILTERLIST_NO_CONTINUE; ! } ! while ( *l == ' ' || *l == '\t' ) ! l++; ! s = l; ! while ( *s != '\0' && *s != ']' ) ! s++; ! if ( *s != ']' ) ! return (0); ! if ( *l == '=' ) { ! flags |= FILTERLIST_SET_SCORE; ! l++; ! } ! if ( s > l ) { ! char * dummy = NULL; ! long result; ! char oldch = *s; ! *s = '\0'; ! result = strtol( l, &dummy, 10 ); ! *s = oldch; ! if ( dummy == NULL || dummy == l ) ! return (0); ! score = (int) result; ! } ! ! results->score = score; ! results->flags = flags; ! ! return (1); ! } ! ! /* ! * free the specified filterlist ! */ ! static void freefilterlist( struct filterlist * flist ) { ! struct filterlist * next; ! while ( flist != NULL ) { ! next = flist->next; ! if ( flist->expr != NULL ) ! free( flist->expr ); ! free( flist ); ! flist = next; ! } ! } ! ! /* ! 
* free the filter group list ! */ ! static void freefgrouplist(void) { ! struct fgrouplist * fg = fgroup; ! struct fgrouplist * next; ! while ( fg != NULL ) { ! next = fg->next; ! if ( fg->flist != NULL ) ! freefilterlist( fg->flist ); ! free( fg ); ! fg = next; ! } ! fgroup = NULL; ! } /* * read filters into memory. Filters are just plain regexp's *************** *** 25,33 **** --- 200,212 ---- FILE * ff; char * l; struct filterlist * f, * oldf ; + struct fgrouplist * fg, * oldfg; + parseresults parseinfo ; if ( filterfile == NULL || !strlen(filterfile) ) return; + if ( fgroup != NULL ) + freefgrouplist(); filter = NULL; ff = fopen( filterfile, "r" ); if ( !ff ) { *************** *** 37,45 **** --- 216,245 ---- } oldf = NULL; debug = 0; + fgroup = (struct fgrouplist*)critmalloc( sizeof(struct fgrouplist), + "Allocating fgrouplist space" ); + fgroup->next = NULL; + fgroup->score = -1; + fgroup->flags = FILTERLIST_SET_SCORE; + fgroup->flist = NULL; + oldfg = fgroup; while ( ( l = getaline( ff ) ) != NULL ) { if ( *l == '#' || *l == '\0' ) continue ; + if (parsescore(l, &parseinfo)) { + oldfg->flist = filter; + filter = NULL; + oldf = NULL; + fg = (struct fgrouplist*)critmalloc( sizeof(struct fgrouplist), + "Allocating fgrouplist space " ); + fg->next = NULL; + fg->score = parseinfo.score; + fg->flags = parseinfo.flags; + fg->flist = NULL; + oldfg->next = fg; + oldfg = fg; + continue; + } f = (struct filterlist *)critmalloc( sizeof(struct filterlist), "Allocating filterlist space" ); f->expr = ( regex_t *) critmalloc( sizeof( regex_t ), *************** *** 49,55 **** syslog( LOG_ERR, "Invalid filter pattern %s", l ); printf( "Invalid filter pattern %s", l ); free( f ); ! } else { if ( !filter ) filter = f; else --- 249,256 ---- syslog( LOG_ERR, "Invalid filter pattern %s", l ); printf( "Invalid filter pattern %s", l ); free( f ); ! } ! 
else { if ( !filter ) filter = f; else *************** *** 57,82 **** oldf = f; } } debug = debugmode; fclose( ff ); } /* * read and filter headers. ! * Return true if headers are correct, false otherwise */ ! int dofilter( char *h ) { struct filterlist * f ; ! int match ; ! f = filter; ! match = REG_NOMATCH ; ! while ( f && match == REG_NOMATCH ) { ! match = regexec( f->expr, h, 0, NULL, 0 ); ! f = f->next ; ! } ! if ( match == 0 ) ! return TRUE; ! else ! return FALSE; } --- 258,303 ---- oldf = f; } } + if ( filter != NULL ) + oldfg->flist = filter; debug = debugmode; fclose( ff ); } /* * read and filter headers. ! * Return the score. */ ! int dofilter( char **headerList, int nHeaders ) { struct filterlist * f ; ! struct fgrouplist * fg ; ! char ** h; ! int n; ! int floop = 1; ! int fgloop = 1; ! int score = 0; ! if ( headerList == NULL || nHeaders < 1 ) ! return (0); ! fg = fgroup; ! while ( fgloop && fg ) { ! f = fg->flist; ! while ( floop && f ) { ! for ( h = headerList, n = nHeaders; h != NULL && n-- > 0; h++ ) { ! if ( regexec( f->expr, *h, 0, NULL, 0 ) != REG_NOMATCH ) { ! if ( (fg->flags & FILTERLIST_SET_SCORE) != 0 ) ! score = fg->score; ! else ! score += fg->score; ! if ( (fg->flags & FILTERLIST_NO_CONTINUE) != 0 ) ! fgloop = 0; ! floop = 0; ! break; ! } ! } ! f = f->next ; ! } ! fg = fg->next; ! } ! return (score); } *** leafnode.h.orig Sat Mar 20 21:38:30 1999 --- leafnode.h Mon Mar 22 04:10:30 1999 *************** *** 115,125 **** */ struct filterlist { struct filterlist * next; regex_t * expr; }; ! extern struct filterlist * filter; void readfilter( char *filterfile ) ; ! int dofilter( char * h ) ; /* * artutil -- handling article files --- 115,139 ---- */ struct filterlist { struct filterlist * next; + int score; + unsigned int flags; regex_t * expr; }; ! struct fgrouplist { ! struct fgrouplist * next; ! int score; ! unsigned int flags; ! struct filterlist * flist; ! }; ! extern struct fgrouplist * fgroup; void readfilter( char *filterfile ) ; ! 
int dofilter( char ** headerList, int nHeaders ); ! ! /* ! * flags for the filterlist structure ! */ ! #define FILTERLIST_SET_SCORE (1<<0) ! #define FILTERLIST_NO_CONTINUE (1<<1) /* * artutil -- handling article files