// 																																			KEDS Project
// 																												(Kansas Event Data System)
//__________________________________________________________________________________
// 																																	 	NEXIS_Filter_C
//
//  This program is designed to prepare Nexis downloads for use in KEDS by filtering
//	assorted junk from the download and locating the date and beginning and end of
//	a story. The details of the filtering are described in the documentation; most
//	are located in the function "Filter_Line".
  
//  This version of the filter is written in ANSI C (via MetroWerks C) and designed 
//	for use as a template for other filters.  Virtually all of the Macintosh-specific 
//	code (except for the file selection dialogs) has been eliminated.  The program
//	uses the MetroWerks "SIOUX" output window, which acts like a dumb-terminal plus
//	a menu.  This was the first KEDS project program converted to C and it retains
//  quite a few Pascal-like characteristics: if you think you can see some ways to
//  make it more efficient, you are probably correct. 

// 	Program checks for duplicate stories by comparing the counts of each letter in the
//	first 255 characters of the story with those of the stories already read for the
//	same date: if the total difference is 24 (SIG_THRESHOLD) or less, the story is
//	treated as a duplicate and skipped.  An optional "filter.headline" file can designate
//  strings that cause a story to be skipped when the string occurs in the NEXIS HEADLINE segment. 

//	Because Nexis downloads are somewhat unpredictable and due to lost phone connections
//	can terminate anywhere, all input operations check for an end-of-file and initiate
//  the exit routine "Do_Exit" if they hit one.  The *normal* place to hit the eof is
//	in the function "Get_Date."

//	HEADER FORMAT
//  The first line of each output record has the format: 
//				YYMMDDbbREUT-AAAA-SS 
//	where
//  b=blank, YY is year, MM is month, DD is day, AAAA is sequence number of article
//  for any given day, and SS is the sequence number of the sentence within the 
//	article.

//	REQUIRED FILES:
//  "filter.abbrev" contains a list of abbreviations that are used to eliminate periods
//	at points other than the end of the sentence.  The format of the file is described
//	in the file itself.  The function "Check_Abbrev" can be used to scan a text file
//	for possible additional abbreviations.

//  OPTIONAL FILES:
//  The program will skip over any story that has a NEXIS "HEADLINE" segment containing 
//  any string found in an optional file named "filter.headline".  This feature provides 
//  for a supplemental level of filtering beyond that done in the original NEXIS download.  
 
//__________________________________________________________________________________
//  																																Revision history
//
//  ca 1992		Various HLEAD filtering programs (Edit_HLEAD, Edit_Reuters, etc)
//						written by Philip Schrodt in Macintosh Think Pascal
//  95.04			Edit_Reuters ported to Turbo Pascal by Philip Huxtable and modified to
//						handle full stories and the KWIC format
//  95.07			Various filter programs handling a number of different formats are  
//						consolidated by Jon Pevehouse in the Think Pascal program Nexis_Filter; 
//						abbreviation checking added
//  96.04			substantially revised and ported to ANSI C (Schrodt)
//  97.04			all-upper case support, HEADLINE filtering and some debugging (Schrodt)
//	97.07			added quote filtering
//	98.01			modified quote filtering to only skip when ." or ," is found

//  Notes on C conversion (96.5.20)
//		The fulltext filter has only been tested with a Reuter 1995 full text download, 
//		though that file seems to use a now-standardized Nexis fulltext format that    
//		covers 1979-1997.  Note that this program is less general in the formats that
//	  it processes than was the Pevehouse "Nexis_Filter" Pascal program.

//	Report bugs to: p-schrodt@ukans.edu

//	The most recent version of this code is available from the KEDS Web site:
//		http://www.ukans.edu/~keds

//	Program development funded by National Science Foundation Grant SBR-9410023 and the
//	University of Kansas General Research Fund Grant 3500-X0-0038.  This program and
//	source code are copyrighted but otherwise in the public domain for non-commercial 
//	purposes.

// Copyright © 1998, University of Kansas  

//__________________________________________________________________________________

#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

//__________________________________________________________
//                                        #define constants

#define 	excep_strings 	"filter.abbrev"			// required file containing abbreviation strings
#define 	HEADLINE_strings 	"filter.headline"	// optional file containing HEADLINE stop-strings

#define		SIGNAT_SIZE 	 128							// number of stories saved for duplicate checking
#define		SIG_THRESHOLD 	24							// signature difference at or below which a story is declared a duplicate
#define		HEADL_SIZE		1024							// size of aHEADL
#define		HEADL_LIMIT		(HEADL_SIZE - 1)	// last usable index of aHEADL; parenthesized so it is safe in any expression
#define		ABBREV_SIZE		 255							// size of abbreviations array 'ascr'
#define		ABBREV_LIMIT	(ABBREV_SIZE - 5)	// highest index at which a new 3-char abbreviation may start

#define 	MAXLINE 	 			80						// maximum output line length
#define 	LINE_LIMIT		1024						// number of lines to check before giving up in Verify_HEADL and Check_Upper
	
//__________________________________________________________
// Prototypes

void Do_Exit( void );						// exit routine (installed with atexit): closes both files, prints the closing message
char Bozo_Filter(void);					// returns a keyed char converted to uppercase; exits the program once it has been called more than 5 times
void Initialize_globals( void );// initialization routine: banner, story buffer, abbreviations, stop-strings, user options

void Init_HEADL( void );				// initialize the HEADLINE stop-strings from "filter.headline" (if present)
void Verify_HEADL(void);				// verifies that file actually has HEADLINE:s; clears hasHEADL if not
int Check_HEADLINE(void);				// returns nonzero if the story should be processed (no stop-string in its HEADLINE)

int  Check_Upper( void );				// checks whether format is all upper case (nonzero = upper case assumed)
int  GetTEXTFile (char filename[]);// Macintosh file-selection procedure; copies the chosen name into filename
void Get_fileinfo( void );			// select input and output files, run the format checks, rewind the input

void Init_Signat(void);					// routines to check for duplicates: reset the signature table and daily counters
int  Check_Signature(void);			// nonzero if the current story is not a duplicate of a stored signature
void Get_date( void ); 					// find line with date and format YYMMDD into date_str
void Filter_Line ( void );			// line filtering procedures: cleans the global line buffer s in place
void Delete_All( void );				// removes every char of s that was tagged with \x01
void Read_story( void );				// Full story input
void Read_HLEAD( void );				// HLEAD input (lead sentence only)

void Write_sentences( void ); 	// Write sentences to file
void Check_Abbrev( void );			// routine for finding potential abbreviations

typedef struct {char stext [16384];} storyType, **storyHand ;	// story text: make it a handle...
																															// (a handle is a pointer to a pointer, hence the (**story).stext accesses)

//___________________________________________________________________________________
// 																																						Globals

FILE		*fin, *fout;							// input and output files (fin is also used briefly to read the two filter.* files)
char		old_date[] = "000000";		// date of previous story
char		date_str[7];							// date returned from "Get_Date": YYMMDD plus terminator
int			stor_ctr;									// sequence number of story in a day
char		ascr[ABBREV_SIZE];				// abbreviations for eliminating periods: consecutive 3-char entries, '\0' after the last
char		fname[32];								// holds output file name (temporarily holds the input file name in Get_fileinfo)
char		s[255];										// text input: the current line; shared work buffer of all the read and filter routines
storyHand		story;								// story text

int			signat[SIGNAT_SIZE][27];	// story signature vectors used to detect duplicates
																	// cell [0] = number of chars scanned (0 marks an unused row), [1..26] = counts of A..Z
int 		cursignat; 								// last cell used to hold a signature
int 		isHLEAD;									// if 0, write full story, otherwise first sentence
int			kduplic, ksent;						// counters for info written per day: duplicate stories skipped, sentences written
char		aHEADL[HEADL_SIZE];				// HEADLINE stop-strings, stored back to back; an empty string ends the list
int			hasHEADL = false;					// do HEADLINE filtering?
int  		fDo_QuoteCheck;						// skips writing sentence if it contains a quote

																				// initialize target strings
const char stEOS[] 			= "UAGE: ";			// end of full story (presumably the tail of a "LANGUAGE: " line -- confirm against a download)
const char stEOHLead[] 	= ". ";					// end of lead
const char stBOS[] 			= "     LEV";		// beginning of story block
const char stBOTxt[]		= "BODY:";			// beginning of story text
const char stHEADLINE[]	= "HEADLINE:";	// HEADLINE target
const char stPAGE[]			= "Reut";				// Page break target (used in Read_Story)
const char id_string[]	= "REUT";				// source identifier written into every output record header

char 			 month_str[] 		= "JanFebMarAprMayJunJulAugSepOctNovDec";  // used to translate dates; shifted to upper case by Get_fileinfo for all-upper-case files


//___________________________________________________________________________________
//																																				main program
int main(void)
// Program entry point.  Initializes the program, selects the files and then loops over
// the stories in the input file: find the date, optionally screen the HEADLINE, read
// the text, and write it unless it duplicates a story already seen on the same date.
// The loop normally ends inside Get_date(), which calls exit(0) when it reaches the
// end of the file; Do_Exit() then runs via atexit().
// (Declared "int main(void)" as required by standard C; the original used "void main".)
{
	Initialize_globals();
  Get_fileinfo();
	atexit(Do_Exit);				// schedule "Do_Exit" on the "exit(0)" command

//	Check_Abbrev();				// "uncomment" this line if you wish to just check for abbreviations 

	while (!feof(fin))			// go through the entire file
	{
		Get_date();														// find the start of the next story and get the date

		if (strcmp(date_str,old_date) == 0 )	// see if we've got a new day
			stor_ctr++; 												// same date; increment story counter
		else
		{																			// record the new date
			strcpy(old_date,date_str);
																					// finish the report line of the previous date...
			if (isHLEAD) printf("  %4d  %4d\n",ksent,kduplic );
			else				 printf("  %4d  %4d  %4d\n",stor_ctr, ksent, kduplic );
			printf("DATE %s", date_str);				// ...and start the line for the new one
			stor_ctr = 1;
			Init_Signat();											// duplicates are only checked within a date
		}
		
		if (Check_HEADLINE())
		{	if (isHLEAD) Read_HLEAD();						// read the text
			else Read_story();
		
			if (Check_Signature()) Write_sentences ();	// write the text
			else kduplic++;
		}
	}

	exit(0);								// runs Do_Exit
	return 0;								// not reached; keeps the function well-formed
}

//________________________________________________________________________________________
// 																						      															Utilities

void Do_Exit( void )
// Close the files and write the exit message; this is installed using atexit().
// A stream that was never opened (NULL) is skipped, since fclose(NULL) is undefined,
// and a failure to close (i.e. flush) the output file is reported to the user.
{ 
	if (fout)
	{	if (fclose(fout) != 0)
			printf("\n\aWarning: an error occurred while closing the output file; it may be incomplete\n");
		fout = NULL;
	}
	if (fin)
	{	fclose(fin);
		fin = NULL;
	}
	printf("\nFINISHED!\nTo exit program, select the Quit option in the menu (Cmd-Q) and then click \"Don't Save\"\n");
	printf("Your filtered text is in the file \"%s\".", fname);
} // Do_Exit

char Bozo_Filter(void)
// Reads one line of keyboard input and returns its first character converted to
// upper case ('\n' if the line was empty).  The rest of the line is discarded so that
// extra characters cannot be taken as the answer to the next prompt.
// The function counts its calls across the whole run; once it has been called more
// than 5 times it prints a message and exits the program, which keeps a prompt loop
// from running forever (this also covers an end-of-file on the keyboard input).
{
	static int patience;				// number of calls so far
	int first, rest;						// int, not char, so that EOF can be recognized

	first = getchar();					// get the response
	rest  = first;
	while ((rest != '\n') && (rest != EOF)) rest = getchar();		// trap the <return> and anything typed before it

	patience++;
	if (patience > 5) 					// user is clueless...
	{ 
		printf("\nPlease seek expert assistance in operating this program...\n\n");	// note the subdued level of sarcasm...
		printf("To exit program, select the Quit option in the menu (Cmd-Q)\nand then click \"Don't Save\"\n");
		exit(0);
	}
	return (char) toupper(first);
} //Bozo_Filter

void Initialize_globals( void )
{
int		kb;
char	c,cr;

	printf("KEDS_Filter: Filter Program for Nexis Downloads\n");
	printf("             Kansas Event Data System (KEDS)\n");
	printf("\n© 1997, University of Kansas\n");
	printf("This program and the KEDS project are supported by: NSF Grant SBR-9410023\n\n");

	story = (storyHand)(NewHandle(sizeof(storyType)));  // Macintosh OS function for allocating heap memory via a handle
	stor_ctr = 0;																				// (otherwise use malloc and a simple pointer)

																	// initialize the abbreviations string
	printf("Reading abbreviations from \"filter.abbrev\"\n");
	fin=fopen(excep_strings,"r");

	if (!fin) 
	{	printf("\nCan't find the file \"filter.abbrev\"; this file is required to run the program\n");
		printf("To exit program, select the Quit option in the menu (Cmd-Q) and then click \"Don't Save\"\n");
		exit(0);
	}
																// read abbreviations into the array 'ascr'
	kb = 0;
	fgets( ascr,8,fin);
	while ((kb <= ABBREV_LIMIT) && (strncmp(&ascr[kb],"#END",4)))
	{	printf("%.3s  ",&ascr[kb]);
		kb+=3; 
		if (!(kb % 42)) printf("\n");		// new line for every 14 abbreviations
		fgets( &ascr[kb],8,fin);
	}
	fclose(fin);
	printf("\n");
	ascr[kb]='\0';										// terminate the array 

	Init_HEADL();							// check for a filter.headline file and initialize

	Init_Signat();						// initialize the signature array

	printf("\n\nDo you wish to write:\n  All sentences in the story (A)\n  Only the lead (first) sentence (L)\n");
	kb=0;
	c='X';
	while ((c!='A') && (c!='L'))
	{
		printf("Please enter A or L and press <return> -->");
		c = Bozo_Filter();															// exit on user-induced infinite loop
	}
	isHLEAD = (c=='L');		// set isHLEAD

	fDo_QuoteCheck = 0;
	if (!(isHLEAD))	     // set fDo_QuoteCheck when coding full stories
	{
		printf("\n\nSkip sentences containing quotes? (Y/N)\n");
		kb=0;
		c='X';
		while ((c!='Y') && (c!='N'))
		{
			printf("Please enter Y (yes) or N (no) and press <return> -->");
			c = Bozo_Filter();
		}
		fDo_QuoteCheck = (c=='Y');	
	}	
} // initialize_globals

//________________________________________________________________________________________
// 																						      					HEADLINE filtering procedures

void Init_HEADL( void )
// Looks for the filter.headline file; if found, reads the strings into the aHEADL array
// and sets hasHEADL.  Strings are in consecutive locations in the array, separated by
// \0's.  End of data is indicated by a null string.
// Reading stops at a line starting with "#END" or at the end of the file.  Blank lines
// are skipped, because storing one would be taken as the end-of-data marker and hide
// every string after it.
{
int			ka, kb;

	printf("\nChecking for \"filter.headline\" file\n");
	fin=fopen(HEADLINE_strings,"r");

	if (!fin)
	{	printf("\nFile not present; no HEADLINE filtering will be done\n");
		return;
	}

	kb = 0;
	while ((fgets(s,255,fin)) && (strncmp(s,"#END",4)))
	{	printf("%s",s);
		if ((s[0] == '\n') || (s[0] == '\0')) continue;		// nothing to store

		ka=0;																							// copy up to the newline or the end of the string
		while ((kb<HEADL_LIMIT) && (s[ka]) && (s[ka] != '\n')) aHEADL[kb++] = s[ka++];
		if (kb<HEADL_LIMIT) 
			aHEADL[kb++] = '\0';	// terminate the string normally
		else 										// out of room -- terminate array after the previous string
		{ printf("\aOut of room to store stop-strings; last string that was read will not be stored\n");
			while ((kb > 0) && (aHEADL[kb-1])) kb--;	// back up to just after the previous string (or to the start of the array)
			break; 
		} 
	}
	aHEADL[kb] = '\0';	// terminate array with a null string
	fclose(fin);
	hasHEADL = true;
} // Init_HEADL

void Verify_HEADL(void)
// Verifies that HEADLINE:s are found in the file; if not it sets hasHEADL to FALSE.
// At most LINE_LIMIT lines are read from the current position of fin.  The caller is
// expected to rewind the file afterwards.
{
int  kb = 0;						// lines read so far
int  found = 0;					// set when a line containing stHEADLINE has been read
	
	while ((!found) && (kb < LINE_LIMIT) && (fgets(s,255,fin)))		// skip lines until HEADLINE
	{	
		kb++;
		found = (strstr(s,stHEADLINE) != 0);
	}

  if (!found)
  {	printf("\a\aThere are no \"HEADLINE:\" segments in this file\nHeadline filtering has been disabled\n");
		printf("Press <return> to continue\n");
		fgets(s,255,stdin);											// just gets the <return>; bounded replacement for gets()
    hasHEADL=false;
  }											
} // Verify_HEADL

int Check_HEADLINE(void)
// Decides whether the next story is to be processed.  Returns TRUE when HEADLINE
// filtering is off, or when the story's HEADLINE line contains none of the stop-strings
// held in aHEADL; returns FALSE when a stop-string is present or the file ends before
// a HEADLINE line is found.  Only the line that contains "HEADLINE:" is examined.
{
char	*stop;

	if (!hasHEADL)
		return true;										// not doing filtering

	for (;;)													// read forward to the HEADLINE line
	{	fgets(s,255,fin);
		if (feof(fin))
			return false;
		if (strstr(s,stHEADLINE) != 0)
			break;
	}

	stop = aHEADL;										// walk the stop-strings; the list ends with an empty string
	while ((*stop) && (strstr(s,stop) == 0))
		stop += strlen(stop) + 1;				// no match: step over this string and its terminator

	if (!(*stop))
		return true;										// list exhausted without a match

	// A stop-string matched.
	// NOTE(review): this loop repeats only while the line just read CONTAINS stBOS, so it
	// normally consumes a single line.  The skipping forward to the next story is then
	// done by the caller's next Get_date(), which reads a line before testing for stBOS.
	// Confirm the intent before changing the test.
	do
		fgets(s,255,fin);
	while ((strstr(s,stBOS) != 0) && (!feof(fin)));

	return false;
} // Check_HEADLINE

//________________________________________________________________________________________
// 																						      											File procedures

int Check_Upper( void )
// Reports whether the download appears to be in all upper case.  The file is read up
// to the first line containing the mixed-case word "Reuter"; from there each line up
// to the next story marker (stBOS) is tested for a leading mixed-case month
// abbreviation from month_str.  Returns FALSE as soon as such a month is found and
// TRUE in every other case (no "Reuter" within LINE_LIMIT lines, end of file, or the
// story marker reached first).
{
static const char	marker[] = "Reuter";
int		nlines = 0;
int		month;
char	*p;

	for (;;)														// go to beginning of first Reuters story
	{	fgets(s,255,fin);
		nlines++;
		if (feof(fin) || (nlines >= LINE_LIMIT))
			return true;										// we couldn't find a lower-case "Reuter"; assume file is upper-case
		if (strstr(s,marker) != 0)
			break;
	}

	for (;;)														// see if we can find a lower-case month
	{	p = s;
		while ((*p) && (!isalpha(*p))) p++;						// find first letter in string
		if (*p)
		{	for (month = 0; month < 12; month++)				// compare with each 3-letter month
				if (strncmp(p,&month_str[3*month],3) == 0)
					return false;
		}

		fgets(s,80,fin);
		if (feof(fin))
			return true;										// file ended before we found a date or LEVEL
		if (strstr(s,stBOS) != 0)
			break;
	}

	return true;												// no lower case month was found
} //Check_Upper

int GetTEXTFile (char filename[])
// Lets the user pick an existing file of type 'TEXT' with the Macintosh OS procedure
// "SFGetFile" and copies the chosen name into filename as a C string.  If the dialog
// is cancelled the program prints a message and exits, so the function only returns
// after a successful selection.
// NOTE(review): the name is copied without a length limit; the caller's buffer must be
// large enough for whatever reply.fName can hold -- confirm against the buffer passed in.
{	
SFTypeList 	typeList;
SFReply 		reply;
Point 			where;
int					len, pos;

	SetPt(&where, 85, 100);                          // Locate the dialog box
	typeList[0] = 'TEXT';
	SFGetFile(where, 0, 0, 1, typeList, 0, &reply); 	// Call SFGetFile

	if (reply.good != TRUE)
	{	printf("\n\aProgram cancelled!\nTo exit, select the Quit option in the menu (Cmd-Q) and then click \"Don't Save\"\n\a");
	 	exit(0);
	}

	len = reply.fName[0];														// first cell holds the length, the chars follow it
	for (pos = 0; pos < len; pos++)
		filename[pos] = reply.fName[pos+1];
	filename[pos] = '\0';
	
	return reply.good;                      				// This is true unless CANCEL was clicked
} // GetTEXTFile


void Get_fileinfo( void )
// Selects and opens the input file (fin) and the output file (fout), then inspects the
// input: if it looks all upper case, the month names and abbreviations are shifted to
// upper case, and if HEADLINE filtering is on, the presence of HEADLINE segments is
// verified.  The input file is rewound before returning.
// Keyboard input is read with bounded calls (no gets()), and the program exits with a
// message if either file cannot be opened.
{
int kb, ch;
size_t len;

	printf("\nPress <return>, then select the file you wish to filter");
	do ch = getchar(); while ((ch != '\n') && (ch != EOF));		// trap the <return>
	GetTEXTFile (fname);
	fin=fopen(fname,"r");
	if (!fin)
	{	printf("\n\aCan't open the input file \"%s\"\n", fname);
		printf("To exit program, select the Quit option in the menu (Cmd-Q) and then click \"Don't Save\"\n");
		exit(0);
	}

	printf("\nEnter the name you want to assign to the output file>>> ");
	if (!fgets(fname,sizeof fname,stdin)) fname[0] = '\0';
	len = strlen(fname);
	if (len && (fname[len-1] == '\n'))
		fname[len-1] = '\0';												// drop the <return>
	else																					// name was too long (or input ended): discard the rest of the line
		do ch = getchar(); while ((ch != '\n') && (ch != EOF));

	fout=fopen(fname,"w");
	if (!fout)
	{	printf("\n\aCan't create the output file \"%s\"\n", fname);
		printf("To exit program, select the Quit option in the menu (Cmd-Q) and then click \"Don't Save\"\n");
		exit(0);
	}
	
	if (Check_Upper())						// see if the text is all upper case
	{
		for (kb = 0; kb < 36; kb++)  month_str[kb] = toupper((unsigned char) month_str[kb]); 	// shift months to upper case
		for (kb = 0; ascr[kb]; kb++) ascr[kb] = toupper((unsigned char) ascr[kb]); 						// same for abbreviations
	}

	if (hasHEADL) Verify_HEADL();	// make sure we've got HEADLINE:s in the text

	rewind(fin);									// rewind after these checks 

} //get_fileinfo

void Init_Signat(void)
// Resets the duplicate-detection state: marks every row of the signat table as unused
// by zeroing its cell [0], sets cursignat to "nothing stored yet" (-1) and clears the
// duplicate and sentence counters.
{
int	row = SIGNAT_SIZE;

	kduplic = 0;											// initialize counters
	ksent   = 0;
	cursignat = -1;

	while (row-- > 0)
		signat[row][0] = 0;							// initialize the cell
} //  Init_Signat

int Check_Signature(void)
// Computes a signature for the current story (vector giving the count of each letter in
// the first 255 chars) and stores it in the next row of signat, which is used as a ring
// of SIGNAT_SIZE rows.  The signature is then compared with every other stored one.
// Returns FALSE if the story is empty or if some stored signature differs from it by
// SIG_THRESHOLD or less (a duplicate); returns TRUE otherwise.
// Row layout: [0] = number of chars scanned minus one (0 marks an unused row),
// [1..26] = counts of the letters A..Z.
// The comparison is now also made when the new signature lands in row 0; previously a
// story stored there after the ring wrapped around was never checked.
{
int		ka, kb, kc, ktot;
char	ca;

	if (!((**story).stext[0])) return FALSE;				// null story, so dump it

//								 ------ compute current signature -------
	cursignat++;
	if (cursignat >= SIGNAT_SIZE) cursignat=0;			// increment and wrap the index 
	
	for (ka=0; ka<=26; ka++) signat[cursignat][ka]=0;		// zero the vector
	
	ka=0;																// put count of letters into signat[cursignat]
	ca=(**story).stext[0];
	while ((ca) && (ka < 255))
	{ if (isalpha((unsigned char) ca))								// only count letters
		{ kc = toupper((unsigned char) ca) - 'A' + 1;		// 1..26 for A..Z
			if ((kc >= 1) && (kc <= 26)) signat[cursignat][kc]++;	// ignore any letter outside A..Z
		}
		ca=(**story).stext[++ka];						// get next letter
	}
	signat[cursignat][0]=--ka;					// record the size of the scanned text

//								 ------ compare with other stories -------

	ktot=255;															// set to high value in case no vector differences are computed
	for (kb=0; kb<SIGNAT_SIZE ; kb++)
	{	if ((!signat[kb][0]) || (kb == cursignat)) continue;							// unused row, or the story itself
		if (abs(signat[kb][0] - ka) > SIG_THRESHOLD) continue;						// bypass if counts are different

		ktot=0;
		for (kc=1; (kc<=26) && (ktot<=SIG_THRESHOLD); kc++)								// compute vector difference
			ktot += abs(signat[kb][kc]-signat[cursignat][kc]);

		if (ktot<=SIG_THRESHOLD) break;				// found a duplicate; no need to look further
	}
	
	if (ktot>SIG_THRESHOLD)
		return TRUE;
	else
		return FALSE;
		
} //  Check_Signature

static const char *Find_Year(const char *p)
// Returns a pointer to the last two digits of the first four-digit year of the form
// 19YY or 20YY found in p, or NULL if there is none.
{
	for ( ; *p; p++)
	{	if ((!strncmp(p,"19",2) || !strncmp(p,"20",2)) &&
				isdigit((unsigned char) p[2]) && isdigit((unsigned char) p[3]))
			return p + 2;
	}
	return NULL;
} // Find_Year

void Get_date( void )
// Finds the next story and its date, and formats the date as YYMMDD in date_str.
// Lines are skipped up to the one containing stBOS; after it, lines are read until one
// is found whose first letters are a month from month_str and which also holds a day
// number followed by a four-digit year.  Lines that start with a month but hold no
// complete date are passed over rather than being misread.
// Calls exit(0) when the end of the file is reached; in a Nexis file downloaded
// without problems, this is where the program will terminate.
{
int			ka, kb;
int			gotdate, mon_loc;
const char	*year;
																								// go to beginning of next story
	do
		fgets(s,255,fin);
	while ((!feof(fin)) && (strstr(s,stBOS) == 0)); 

	if (feof(fin))	exit(0);	// we're finished, so exit...  

	gotdate = 0;
	mon_loc = 0;
	year = NULL;
	do												// read lines until we find one that starts with a month
	{	fgets(s,80,fin);
		if (feof(fin))	exit(0);										// file ended before we got a date...  
		ka = 0;
		while ((s[ka]) && (!isalpha((unsigned char) s[ka]))) ka++;	// find first letter in string
		if (!s[ka]) continue;												// no letters on this line

		for (kb = 0; kb <= 33; kb+=3 ) 							// check for the month
			if (!strncmp(&s[ka],&month_str[kb],3)) break;
		if (kb > 33) continue;											// line does not start with a month
		mon_loc = kb;

		while ((s[ka]) && (!isdigit((unsigned char) s[ka]))) ka++;	// find the numerical day
		if ((!s[ka]) || (!s[ka+1])) continue;				// no day, or nothing after it

		year = Find_Year(&s[ka+2]);									// &[ka+2] starts the search after the day
		if (year) gotdate = 1;
	} while (!gotdate); 

	date_str[0] = year[0];													// set date_str to the year
	date_str[1] = year[1];
																									
	kb= (mon_loc / 3) + 1;													// get the ordinal month from mon_loc
	sprintf(&date_str[2],"%02i",kb);								// add the month to the date string

	if (isdigit((unsigned char) s[ka + 1]))					// add the day; this is done after adding month to 
	{	date_str[4] = s[ka];													// overwrite the \0 inserted by sprintf
		date_str[5] = s[ka + 1];}											// two-digit day here...
	else
	{	date_str[4] = '0';														// ...one-digit day here
		date_str[5] = s[ka];}

	date_str[6] = '\0';															// terminate the string 

} //get_date


//______________________________________________________________________________
// 																										Line filtering procedures

void Filter_Line ( void )
// Filters the global line buffer s in place.
// Pass 1 removes leading blanks, control chars, chars outside the printable ASCII range,
// the Nexis highlight markers < and >, and repeated blanks; turns '\n' into a blank,
// ? and ! into a period, and '' into a double-quote.  Double-quotes are then either
// removed, or (when fDo_QuoteCheck is set) kept, with ." flipped to ". so that the
// end of the sentence can be located.  A line holding nothing but blanks and control
// chars becomes the null string.
// Pass 2 removes periods that are followed by a blank but do not end a sentence:
// those after a single capital letter (initials, U.S., etc.) and those whose three
// preceding chars match an entry of the abbreviation list 'ascr'.
// Chars to be removed are tagged with \x01 and squeezed out by Delete_All().
{
int 		ka,kw;
char		sc[3];						// the 3 chars before a period (no terminator; compared with strncmp)

	ka=0;
	while ((s[ka]) && (s[ka]<=' ')) s[ka++]= '\x01';		// remove leading blanks and control chars
	
	if (!s[ka])				// end of string; we're done
	{ s[0] = '\0';		// set to a null string 
	  return;
	}		
	for (ka = 0; s[ka]; ka++ )  //remove nontext characters by setting to '\x01'
	{
		if (s[ka] == '\n' ) 			//convert \n to a blank
			s[ka] = ' '; 	
		else if ((s[ka] < ' ' ) || (s[ka] > '~' )) 	//remove all control characters and characters >=127
			s[ka] = '\x01'; 
		else if ((s[ka] == '>' ) || (s[ka] == '<' ))  	//remove Nexis highlight markers < and >
			s[ka] = '\x01';
		else if ((s[ka] == '?' ) || (s[ka] == '!' ))	  // change ?, ! to . 
			s[ka] = '.';
		else if ((s[ka] == '\'' ) && (s[ka+1] == '\'' )) // change double apostrophes to quotes 
		{ 
			if ((ka > 0) && (s[ka-1] == '.'))  // '' occurs at end of sentence (ka > 0: never look in front of the buffer)
			{
				s[ka+1]   = ' ';
				s[ka]     = '"';
			}
			else 
			{
				s[ka]   = ' ';
				s[ka+1] = '"';
			}
		}

	 	if (s[ka] == '"' )
	 	{
	 	  if (fDo_QuoteCheck)
		 	{
		 	 	if ((ka > 0) && (s[ka-1] == '.' )) 	// flip the order to correctly locate the end of sentences
				{
					s[ka-1] = '"';
					s[ka]   = '.';
				} 
			}
	  	else										// remove all double-quotes
				s[ka] = '\x01';
		}

		if ((s[ka] == ' ' ) && (s[ka+1] == ' ' ))	// remove consecutive blanks
			s[ka] = '\x01';
	}
	Delete_All();							// get rid of everything marked

	if (strlen(s)<=2) return;		// string is too short to contain abbreviations, so we're done
	
	for (ka = 2; s[ka]; ka++ )  // deal with the periods
	{
		if ((s[ka] != '.' ) || (s[ka+1] != ' ' )) continue;	// only periods followed by a blank are of interest

		if ((isupper((unsigned char) s[ka - 1])) &&
				((s[ka - 2] == ' ' ) || (s[ka - 2] == '.' ))) 	// middle initials and U.S., U.N., O.J., etc.
			s[ka] = '\x01';

																	// check whether we've got an abbreviation in the filter.abbrev list           
		if (ka>2)											// set the target string: the 3 chars before the period
			memcpy(sc,&s[ka - 3], 3);
		else													// ka == 2: only 2 chars are available, so pad on the left with a blank
		{	sc[0] = ' ';
			sc[1] = s[0];
			sc[2] = s[1];
		}
																	// look for it in ascr
		kw = 0;
		while ((ascr[kw]) && (strncmp(sc,&ascr[kw],3)))	kw+=3;
		if (ascr[kw]) s[ka] = '\x01'; 			//we've got one, so remove the period  
	} 	// for
	Delete_All();		// get rid of everything marked

}	// Filter_Line 

void Delete_All( void )
// Compacts the global line buffer s in place: every char tagged with \x01 is dropped
// and the remaining chars are moved up to close the gaps.
{
char	*from = s;				// next char to examine
char	*to   = s;				// where the next kept char goes

	for ( ; *from; from++)
	{	if (*from != '\x01')
			*to++ = *from;
	}
	*to = '\0';
} // Delete_All


//___________________________________________________________________________
// 																														HLEAD procedure

void Read_HLEAD( void )
// Reads the first sentence of a story into (**story).stext, terminating when
// stEOHLead= ". " (period-space) is found after abbreviation filtering.
// Lines are skipped up to the one containing stBOTxt; the lines after it are filtered
// with Filter_Line and appended, each ending with a blank, until one contains ". "
// (or more than 1024 chars have been stored, or the file ends).  The text is then cut
// after its last period and terminated with ". "; if no period is found, or nothing
// was read, the story is set to the null string.
// Calls exit(0) if the file ends before stBOTxt is found.
{
int		ka;
int		kr= 0;  						//	kr counts the chars in story
int		isEOH = FALSE;

	do 										//skip lines until beginning of text
		fgets(s,255,fin);
	while ((!feof(fin)) && (strstr(s,stBOTxt) == 0));
	
	if (feof(fin)) exit(0);			// this is not the normal place to exit, but allow for it anyway...
	
	fgets(s,255,fin);						// go to next line after stBOTxt
																					 //read lines until end-of-HLEAD is found
  while ((!isEOH) && (!feof(fin)) )
 	{	
		Filter_Line();

		if (s[0])																							// skip if s is null string
		{	for (ka=0; s[ka]; ka++) (**story).stext[kr++] = s[ka];  // xfer s into story 
			if (s[ka-1] != ' ') (**story).stext[kr++] = ' ';			// end line with a blank

			isEOH = (strstr(s,stEOHLead) != 0);										// compare the pointer; don't squeeze it into an int
			if (kr>1024) isEOH = TRUE;														// just in case...  this could occur if download was noisy
		}

	  if (!isEOH) fgets(s,255,fin);
	 }	// while

	if (kr > 0) kr--;																		// index of the last char stored (stays 0 if nothing was read)
	while ((kr) && ((**story).stext[kr] != '.')) kr--;	// find the last period in the story
	if (kr)
	{	(**story).stext[++kr] = ' ';							// add a blank... 
		(**story).stext[++kr] = '\0';							// terminate story after the period 
	}
	else (**story).stext[0] = '\0';							// null story

}	// Read_HLEAD

//____________________________________________________________________________
// 																												Full story procedure

void Read_story( void )
// Reads the full text of a story into (**story).stext, 
// terminating when a line containing stEOS is found (or the file ends).
// Lines are skipped up to the one containing stBOTxt; the lines after it are filtered
// with Filter_Line and appended, each ending with a blank.  A page break (a line with
// '>' as its third char) is skipped up to and including the line containing stPAGE.
// Once the story buffer cannot take another line, the rest of the story is read but
// not stored.  The text is finally cut after its last period and terminated with ". ";
// if there is no period the story is set to the null string.
// Calls exit(0) if the file ends before stBOTxt is found.
{
int		ka;
int		kr= 0;  //	kr counts the chars in story
int		isFull = 0;																					// set when a line no longer fits
const int	room = (int) sizeof((**story).stext);						// capacity of the story buffer
char	*got;

	do 										//skip lines until beginning of text)
		fgets(s,255,fin);
	while ((!feof(fin)) && (strstr(s,stBOTxt) == 0));
	
	if (feof(fin)) exit(0);			// this is not the normal place to exit, but allow for it anyway...
	
	fgets(s,255,fin);						// go to next line
																									   //get all of the lines in the story
  while ((strstr(s,stEOS) == 0) && (!feof(fin)) )
 	{	
		if ((s[0]) && (s[1]) && (s[2]=='>')) 		// skip over page break on >>> (only look at s[2] if the line is that long)
		{	do got = fgets(s,255,fin); while ((got) && (!strstr(s,stPAGE)));
			if (!got) break;											// file ended inside the page break
		  fgets(s,255,fin);
		  continue;															// the new line gets the same tests as any other
		}
		Filter_Line();

		if (s[0])
		{																				// room is needed for the line, its blank and the final " \0"
			if ((!isFull) && (kr + (int) strlen(s) + 3 <= room))
			{	for (ka=0;s[ka];ka++) (**story).stext[kr++] = s[ka];  // xfer s into story
				if (s[ka-1] != ' ') 
				   (**story).stext[kr++] = ' ';			// end line with a blank
			}
			else isFull = 1;											//if max chars in story is reached, the remaining lines are dropped
		}

		fgets(s,255,fin);

	}	// while

	if (kr > 0) kr--;																		// index of the last char stored
	while ((kr) && ((**story).stext[kr] != '.')) kr--;	// find the last period in the story
	if (kr)
	{	(**story).stext[++kr] = ' ';							// add a blank... 
		(**story).stext[++kr] = '\0';							// terminate story after the period 
	}
	else (**story).stext[0] = '\0';							// null story; this will be eliminated by Check_Signat
	
}	// Read_story

//____________________________________________________________________________
// 																														Output procedure

void Write_sentences( void ) 
// Write sentences to the output file.
// The story in (**story).stext is split at each ". " (period-blank).  Every sentence is
// written as a header line "<date_str>  <id_string>-<story number>-<sentence number>"
// followed by the text, wrapped between words at about MAXLINE chars, and a blank line.
// When fDo_QuoteCheck is set, a sentence that contains a double-quote followed by a
// period, or preceded by a comma, is skipped; it still uses up a sentence number.
// ksent is incremented for every sentence written.
// All scans stop at the end of the story text, even if it does not end with ". ".
{
int			ka, kb, kc;
int			kline;								// tracks number of chars written to a line
int			last;									// index of the last char of the word being written
char		rec_id[3] = "01";			// serial number of sentence
char		stor_id[5]; 					// sequence of the story within a day
char 		ca;
int			fquote_okay = 1;

	kb=0;
	while (((**story).stext[kb]) && (!isalnum((unsigned char) (**story).stext[kb]))) kb++; // find first letter or digit
	
	if (!(**story).stext[kb]) return;								// nothing there...
	
	while ((**story).stext[kb])											// go through the entire story
	{ 

		if (fDo_QuoteCheck)
		{
			kc = kb-1;
			do 
			{
				do 
					kc++;
				while (	((**story).stext[kc]) 				&& 
				        ((**story).stext[kc] != '"') && 
				        (((**story).stext[kc] != '.') || ((**story).stext[kc+1] != ' '))); // scan the sentence

				if (((**story).stext[kc] == '"') && 			// check the double-quote conditions; if ". or ," 
				    (((**story).stext[kc+1] == '.') || ((kc > 0) && ((**story).stext[kc-1] == ','))))	// then skip sentence
				{ while (	((**story).stext[kc]) 			 &&
									(((**story).stext[kc] != '.') || ((**story).stext[kc+1] != ' '))) 
						kc++; 												// go to the end of the sentence (or of the text)
					kb = ((**story).stext[kc]) ? kc+1 : kc;	// reset kb; don't step over the terminator
					fquote_okay = 0;								// set flag to skip writing
				}
			}
			while ((fquote_okay) && 
						 (((**story).stext[kc] != '.') || ((**story).stext[kc+1] != ' ')) &&
						 ((**story).stext[kc]) );
		}			 	
		if (fquote_okay)
		{
			sprintf(stor_id,"%04i",stor_ctr % 10000);			// convert the story sequence to a string (4 digits fit stor_id)
			fprintf(fout, "%s  %s-%s-%s\n",date_str, id_string, stor_id, rec_id); // write the header
			ksent++;

			kline=0;
			while (((**story).stext[kb]) &&
						 ((((**story).stext[kb] == '.') && ((**story).stext[kb+1] == ' ')) == 0)) // write a sentence
			{
				ka = kb;																		// ka = start of word; kb = end of word
				while  (((**story).stext[kb]) &&
								((**story).stext[kb] != ' ') && ((**story).stext[kb] != '.'	)) kb++; // get end of word 

				if ((kline + kb - ka) > MAXLINE ) 
				{ fprintf(fout,"\n");							// start new line
					kline = 0;}

				last = ((**story).stext[kb]) ? kb : kb-1;		// the word includes its blank or period, but never the terminator
				for (kc = ka; kc <= last; kc++ ) 		// write the word
				{ ca = (**story).stext[kc];
					putc(ca,fout);									// for some reason, putc((**story).stext[kc],fout) doesn't work in MetroWerks, so this construction is used instead
				}
				kline += (kb-ka+1);								// increment kline with the word length 
				if (!(**story).stext[kb]) break;	// end of the text
				if (((**story).stext[kb] != '.') || ((**story).stext[kb+1] != ' ')) kb++; // skip to next word unless this is the end of the sentence
			}
			if ((**story).stext[kb]) kb++;			// step over the period that ended the sentence
			fprintf(fout,"\n\n");
		}
		else fquote_okay = 1;
		
		while (((**story).stext[kb]) && ((**story).stext[kb] == ' ') ) kb++; // skip leading blanks 
									
		rec_id[1]++;																// increment rec_id
		if (rec_id[1]>'9')
		{ rec_id[1]='0';
		  rec_id[0]++;
		  if (rec_id[0] == ':') rec_id[0] = 'A';   // number sentences using 8*, 9*, A*, B*...
		}
	}
}	// Write_sentences

		
//_________________________________________________________________________
// 																										Check_Abbrev procedure

void Check_Abbrev( void )
// This procedure reads through a file and writes abbreviation candidates
// to RWTest.Out.  Abbreviation candidate is the pattern
//	 "b [uc] * {*} {* }. b"
//	where
//				b 	= blank
//			 [uc] = upper case letter
//				*		=	any character  {*}: optional
// The pattern is ignored if it occurs at the line of a line.  The text is
// filtered -- including the elimination of abbreviations from the 
// 'filter.abbrev' file -- before processing.
// The function and program terminate when the eof is hit.

{
int		ka, kb, isAbbrev;
char	*sc;

	fgets(s,255,fin);
  while (!feof(fin)) 				// main loop: reads the entire file
 	{	
		Filter_Line();					// apply the filter

		if (!s[0]) 							// null line after filtering; so just go to the next one
		{	fgets(s,255,fin);
			continue;					}

		sc=strstr(s,stEOHLead);		// check for ". " pattern
		if ((sc) && (sc[2]))			// pattern is present and doesn't occur at the end of a line
		{	ka=sc-s;								// get index of first occurence
		 	isAbbrev=FALSE;
			while ((!isAbbrev) && (s[ka]))			// check the remainder of the string
		 	{ 
	  		if ((s[ka]=='.') && (s[ka+1]==' '))
				{ for (kb=2;kb<=4;kb++) 																			// check for 2,3,4 char abbrevs
				  {	if ((ka==kb) && (isupper(s[0]))) isAbbrev=TRUE;						// abbrevs at start of line
				  	if ((ka>kb) && (isupper(s[ka-kb])) && (s[ka-kb-1]==' ')) 	// abbrevs in mid-line
				  			isAbbrev=TRUE;
				  }
				}
				ka++;
			}

			if (isAbbrev)							// write the string 
			{	fprintf(fout,"%s\n",s);
				printf("\n%s\n",s);
			}
		}
		fgets(s,255,fin);						// get the next line
	}	// while

	exit(0);											// exit when we are finished...

}	// Check_Abbrev



		
