/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
**-------------------------------------------------
** Added support for METADATA
** G. Hill  ghill@library.berkeley.edu   3/18/97
**
** Added Document Properties support
** Mark Gaulin gaulin@designinfo.com  11/24/98
**
** Added safestrcpy() macro to avoid corruption from strcpy overflow
** SRE 11/17/99
**
** Added Document Filter support (e.g. PDF, Winword)
** Rainer.Scherg@t-online.de   (rasc)  1998-08-07, 1999-05-05, 1999-05-28
**
** Added some definitions for phrase search
** Structure location modified to add frequency and word positions
** Structure entry modified to add link hash values for direct search
**
** Jose Ruiz jmruiz@boe.es 04/04/00
*/

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <locale.h>
#include <ctype.h>
#include "config.h"

#ifdef NEXTSTEP
#include <sys/dir.h>
#else

#ifdef _WIN32
#include "win32/dirent.h"
#include "Win32/regex.h"
#define snprintf _snprintf
#define pclose _pclose
#define popen _popen
#define vsnprintf _vsnprintf
#define strcasecmp stricmp
#else    
#include <dirent.h>
#include <regex.h>
#endif

#endif

#include <ctype.h>
#include <stdlib.h>
#include <time.h>
#include <setjmp.h>

/*
** Version / format identification strings and the default index file name.
** NOTE(review): INDEXHEADER says "SWISH" while INDEXVERSION says "Swish-e";
** both carry format number 2.0.  Their use is not visible in this header --
** presumably one is kept for recognising older index files; confirm.
*/
#define VERSION "2.0"
#define INDEXHEADER "# SWISH format 2.0"
#define INDEXVERSION "# Swish-e format 2.0"
#define INDEXFILE "index.swish-e"

/*
** "# Label:" strings, one per header field.
** NOTE(review): presumably the line prefixes written to and matched in the
** header section of an index file -- confirm against the index reader/writer.
*/
#define NAMEHEADER "# Name:"
#define SAVEDASHEADER "# Saved as:"
#define COUNTSHEADER "# Counts:"
#define INDEXEDONHEADER "# Indexed on:"
#define DESCRIPTIONHEADER "# Description:"
#define POINTERHEADER "# Pointer:"
#define MAINTAINEDBYHEADER "# Maintained by:"
#define WORDCHARSHEADER "# WordCharacters:"
#define MINWORDLIMHEADER "# MinWordLimit:"
#define MAXWORDLIMHEADER "# MaxWordLimit:"
#define BEGINCHARSHEADER "# BeginCharacters:"
#define ENDCHARSHEADER "# EndCharacters:"
#define IGNOREFIRSTCHARHEADER "# IgnoreFirstChar:"
#define IGNORELASTCHARHEADER "# IgnoreLastChar:"

/* Labels for the stemming / soundex settings. */
#define STEMMINGHEADER	"# Stemming Applied:"
#define SOUNDEXHEADER "# Soundex Applied:"

/* Labels for document-property related header lines (note: no trailing
** colon on DOCPROPHEADER and SORTDOCPROPHEADER). */
#define DOCPROPHEADER "# DocProperty"
#define DOCPROPENHEADER "# DocumentProperties:"
#define SORTDOCPROPHEADER "# SortDocProperty"

/*
** Buffer-size limits and hash-table sizes.
** Within this header MAXSTRLEN sizes errorstr[], HASHSIZE sizes
** hashstoplist[], BIGHASHSIZE sizes resulthashlist[], fwordtotals[] and
** filehashlist[], and SEARCHHASHSIZE sizes hashentries[] and hashoffsets[].
** The other limits are used outside this header.
** NOTE(review): 101 and 1009 are prime but 10001 = 73 * 137 is not; if the
** hash function relies on a prime table size, SEARCHHASHSIZE may distribute
** less evenly -- confirm against the hash code.
*/
#define MAXFILELEN 1000
#define MAXSTRLEN 2000
#define MAXWORDLEN 1000
#define MAXTITLELEN 200
#define MAXSUFFIXLEN 10
#define MAXENTLEN 10
#define HASHSIZE 101
#define BIGHASHSIZE 1009
#define SEARCHHASHSIZE 10001
#define MAXPAR 10
#define MAXCHARDEFINED 200

#define TI_OPEN 1
#define TI_CLOSE 2
#define TI_FOUND 4
#define NOWORD "thisisnotaword"
#define SECSPERMIN 60

#define NO_RULE 0
#define AND_RULE 1
#define OR_RULE 2
#define NOT_RULE 3
#define PHRASE_RULE 4
#define AND_NOT_RULE 5

#define IN_FILE 1
#define IN_TITLE 2
#define IN_HEAD 4
#define IN_BODY 8
#define IN_COMMENTS 16
#define IN_HEADER 32
#define IN_EMPHASIZED 64
#define IN_ALL 127

/* NOTE(review): presumably the number of bytes used for a long in the index
** file (see printlong/readlong) -- confirm. */
#define MAXLONGLEN 4

/* Size of the offsets[] table: 256 slots for chars plus ten more for other
** data (256 + 10 = 266; the earlier comment said 255, which does not add up).
** The last four slots are named below. */
#define MAXCHARS 266

/* Indices of the four special slots at the top of a MAXCHARS-sized table.
** Each expansion is parenthesized so it stays a single operand inside a
** larger expression (e.g. 2 * METANAMEPOS or MAXCHARS - FILEOFFSETPOS). */
#define METANAMEPOS (MAXCHARS - 4)
#define STOPWORDPOS (MAXCHARS - 3)
#define FILELISTPOS (MAXCHARS - 2)
#define FILEOFFSETPOS (MAXCHARS - 1)

/* Capacities of the prop[] and propSort[] arrays in struct result. */
#define MAX_PROPS_TO_DISPLAY 50
#define MAX_PROPS_TO_SORT 50

/*
 * This structure lists all of the functions that must be
 * implemented by an Indexing Data Source.
 * Right now there are two Indexing Data Source types:
 *  file-system based and an HTTP web crawler.
 * Any Data Source can be created as long as all of the
 * function pointers below are properly initialized.
 * The global IndexingDataSource pointer declared later in this
 * header has this type.
 * NOTE(review): vtell_fn reports a position as int while vseek_fn
 * takes one as long -- confirm the mismatch is intentional.
 */
struct _indexing_data_source_def
{
  const char* IndexingDataSourceName;           /* long name for data source */
  const char* IndexingDataSourceId;             /* short name for data source */
  void (*indexpath_fn)(char *path);		/* routine to index a "path" */
  int (*vgetc_fn)(void *vp);			/* get char from "file" */
  int (*vsize_fn)(void *vp);			/* get size of "file" */
  int (*vtell_fn)(void *vp);			/* get position in "file" */
  int (*vseek_fn)(void *vp,long pos);		/* set position in "file" */
  int (*parseconfline_fn)(char *line);		/* parse config file lines */
};

/* One document property (meta field id plus its string value); entries are
** chained through `next`. */
struct docPropertyEntry 
{
	int metaName;		/* meta field identifier; from getMetaName() */
	char *propValue;	/* string from META's CONTENTS attribute */

	struct docPropertyEntry *next;	/* next property in the chain */
};

/* One known meta name; entries are chained through `next` (the global
** metaEntryList points at a struct metaEntry). */
struct metaEntry {
	char* metaName;		/* name of the meta field */
	int index;		/* NOTE(review): presumably the numeric id stored as "metaName" in other structs -- confirm */
	
	/* is this meta field a Document Property? */
	char isDocProperty;		/* true if doc property */
	char isOnlyDocProperty;	/* true if NOT an indexable meta tag (ie: not in MetaNames) */
	
	struct metaEntry* next;
};

/* One search hit.  Results are chained through `next`, and through
** `nextsort` while they are being sorted. */
struct result {
	int filenum;
	int rank;
	int structure;		/* NOTE(review): presumably a mask of IN_* bits -- confirm */
	int frequency;
	int *position;		/* word positions (array length is not recorded here; presumably `frequency` entries -- confirm) */
	struct result *next;
	struct result *nextsort;   /* Used while sorting results */
	char *fileInfo;
	/* file position where this document's properties are stored */
	long propPos;
	char *prop[MAX_PROPS_TO_DISPLAY];	/* up to MAX_PROPS_TO_DISPLAY property strings */
	char *propSort[MAX_PROPS_TO_SORT];	/* up to MAX_PROPS_TO_SORT property strings used as sort keys */
};

/* One indexed file with its title, size and document properties; entries
** are chained through `next` (the global filelist points at a struct file). */
struct file {
	char *filename;
	char *title;
	int size;
	struct docPropertyEntry* docProperties;	/* head of this file's property chain */
	struct file *next;
};

/* Pairs an int (fileshort) with a long (filelong); entries are chained
** through `next` and stored in the global filehashlist[] buckets.
** NOTE(review): presumably maps a file number to a file offset in the index
** -- confirm against the code that fills filehashlist. */
struct filenum {
	int fileshort;
	long filelong;
	struct filenum *next;
};

/* One occurrence record of a word in a file; records are chained through
** `next`.  Per the change log at the top of this file, `frequency` and the
** word positions were added for phrase search. */
struct location {
	int filenum;
	int rank;
	int structure;		/* NOTE(review): presumably a mask of IN_* bits -- confirm */
	int metaName;		/* meta field identifier */
	int frequency;
	int emphasized;
	int max_positions;	/* NOTE(review): presumably the allocated capacity of position[] -- confirm */
	int *position;		/* word positions */
	struct location *next;
};

/* One indexed word and its list of locations.  Per the change log at the
** top of this file, `nexthash` was added to link hash values for direct
** search (the global hashentries[] holds struct entry pointers). */
struct entry {
	char *word;
	int tfrequency;		/* NOTE(review): presumably the total frequency of the word -- confirm */
	struct location *locationlist;
	struct entry *nexthash;	/* next entry in the same hash chain */
	long fileoffset;
	int currentfilenum;
	struct location *currentlocation;
};

/* Filename/title pair for one document; stored by pointer in
** struct docentryarray. */
struct docentry {
        char *filename;
        char *title;
};


/* Generic chain of strings, linked through `next`.  Used in this header for
** replacelist, searchwordlist, nocontentslist, dirlist, indexlist,
** suffixlist and the hashstoplist[] buckets. */
struct swline {
	char *line;
	struct swline *next;
};

/* Word total for one file number; entries are chained through `next` and
** stored in the global fwordtotals[] buckets. */
struct fwordtotal {
	int filenum;
	int totalwords;
	struct fwordtotal *next;
};

/* Document filter (see the "Document Filter support" note at the top of
** this file): pairs a file suffix with a program; entries are chained
** through `next` (the global filterlist points at a struct filter). */
struct filter {
        char *suffix;
        char *prog;
        struct filter *next;
};

/* Array of struct entry pointers with size bookkeeping (the global
** entrylist points at one).
** NOTE(review): presumably currentsize is the number of slots in use and
** maxsize the allocated capacity of elist -- confirm. */
struct entryarray {
	int currentsize;
	int maxsize;
	struct entry **elist;
};


/* Array of struct docentry pointers with size bookkeeping.
** NOTE(review): presumably currentsize is the number of slots in use and
** maxsize the allocated capacity of dlist -- confirm. */
struct docentryarray {
        int currentsize;
        int maxsize;
        struct docentry **dlist;
};

/*
** VAR prefixes every global below.  It expands to `extern` unless MAIN_FILE
** is defined, so the one translation unit that defines MAIN_FILE before
** including this header provides the definitions and every other unit only
** sees declarations.
*/
#ifndef MAIN_FILE
#define VAR extern
#else
#define VAR
#endif

/* Currently selected indexing data source (table of function pointers). */
VAR struct _indexing_data_source_def *IndexingDataSource;

/* File list, word entry array and the SEARCHHASHSIZE-bucket table of
** word entries. */
VAR struct file *filelist;
VAR struct entryarray *entrylist;
VAR struct entry *hashentries[SEARCHHASHSIZE];

/* String chains (struct swline); hashstoplist has HASHSIZE buckets.
** NOTE(review): the purpose of each list is inferred from its name only. */
VAR struct swline *replacelist;
VAR struct swline *searchwordlist;
VAR struct swline *nocontentslist;
VAR struct swline *dirlist;
VAR struct swline *indexlist;
VAR struct swline *hashstoplist[HASHSIZE];
VAR char **stopList;

/* BIGHASHSIZE-bucket tables for results, per-file word totals and file
** numbers, plus the head of the meta name chain. */
VAR struct result *resulthashlist[BIGHASHSIZE];
VAR struct fwordtotal *fwordtotals[BIGHASHSIZE];
VAR struct filenum *filehashlist[BIGHASHSIZE];
VAR struct metaEntry* metaEntryList;

/* Offset tables: offsets[] has MAXCHARS slots (its last four are named by
** METANAMEPOS .. FILEOFFSETPOS), hashoffsets[] has one slot per
** hashentries[] bucket. */
VAR long offsets[MAXCHARS];
VAR long hashoffsets[SEARCHHASHSIZE];
/* Character-class settings.  Each set has a string, a len* companion and a
** 256-entry lookup table.
** NOTE(review): presumably len* is the allocated size of the string and the
** table is indexed by unsigned char value -- confirm. */
VAR int lenwordchars;
VAR char *wordchars;
VAR int wordcharslookuptable[256];
VAR int lenbeginchars;
VAR char *beginchars;
VAR int begincharslookuptable[256];
VAR int lenendchars;
VAR char *endchars;
VAR int endcharslookuptable[256];
VAR int lenignorelastchar;
VAR char *ignorelastchar;
VAR int ignorelastcharlookuptable[256];
VAR int lenignorefirstchar;
VAR char *ignorefirstchar;
VAR int ignorefirstcharlookuptable[256];
VAR int lentranslatechars1;
VAR char *translatechars1;
VAR int lentranslatechars2;
VAR char *translatechars2;
VAR char *savedasheader;
VAR int lensavedasheader;
/* Counters, limits and option flags (meanings follow from the names; their
** use is outside this header). */
VAR int totalwordsheader;
VAR int totalfilesheader;
VAR int verbose;
VAR int minwordlimit;
VAR int maxwordlimit;
VAR int bigrank;
VAR int beginhits;
VAR int maxhits;
VAR int totalwords;
VAR int totalfiles;
VAR int followsymlinks;
VAR int commonerror;
VAR int stopPos;
VAR int indexComments;
VAR int applyStemmingRules;					/* added 11/24/98 - MG */
VAR int applySoundexRules;                                    /* added 09/01/99 - DN */
VAR int useCustomOutputDelimiter;			/* added 11/24/98 - MG */
VAR int lencustomOutputDelimiter;
VAR char *customOutputDelimiter;	/* added 11/24/98 - MG */
VAR int ignoreTotalWordCountWhenRanking;	/* added 11/24/98 - MG */
VAR struct filter *filterlist;                  /* 1998-08-07 rasc */

VAR int lenfilterdir;
VAR char *filterdir;                  /* 1998-08-07 rasc */

/* String/len* pairs.
** NOTE(review): indexn/indexd/indexp/indexa presumably hold the index name,
** description, pointer and maintainer (compare the "# Name:" ...
** "# Maintained by:" header labels) -- confirm. */
VAR int lenindexedon;
VAR char *indexedon;
VAR int lenindexn;
VAR char *indexn;
VAR int lenindexd;
VAR char *indexd;
VAR int lenindexp;
VAR char *indexp;
VAR int lenindexa;
VAR char *indexa;
/* Fixed MAXSTRLEN-byte buffer (named for error messages). */
VAR char errorstr[MAXSTRLEN];

		/* 06/00 Jose Ruiz */
VAR int applyautomaticmetanames;
VAR int isvowellookuptable[256];	/* 256-entry lookup table */

/* For IndexOnly var */
VAR struct swline *suffixlist;
#ifdef MAIN_FILE

/* Character string defined only in the translation unit that sets MAIN_FILE;
** other units get the extern declaration in the #else branch.
** NOTE(review): the bytes right after "z" look like mis-decoded 8-bit
** characters -- presumably a run of accented ISO-8859-1 letters damaged by a
** charset conversion.  Confirm against a pristine copy before relying on
** this literal. */
char *indexchars ="abcdefghijklmnopqrstuvwxyz،ݟ&#;0123456789_\\|/-+=?!@$%^'\"`~,.<>[]{}";

/* Built-in stop word list: lower-case words in roughly alphabetical order,
** terminated by a NULL pointer.
** NOTE(review): "we" appears twice in a row; harmless if the list is loaded
** into a set, but possibly a typo for another word -- confirm. */
char *defaultstopwords[] = {
"a", "above", "according", "across", "actually", "adj", "after", 
"afterwards", "again", "against", "all", "almost", "alone", "along", 
"already", "also", "although", "always", "among", "amongst", "an", "and", 
"another", "any", "anyhow", "anyone", "anything", "anywhere", "are", "aren", 
"aren't", "around", "as", "at", "be", "became", "because", "become", "becomes", 
"becoming", "been", "before", "beforehand", "begin", "beginning", "behind", 
"being", "below", "beside", "besides", "between", "beyond", "billion", "both", 
"but", "by", "can", "can't", "cannot", "caption", "co", "could", "couldn",
"couldn't", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", "don",
"don't", "down", "during", "each", "eg", "eight", "eighty", "either", "else",
"elsewhere", "end", "ending", "enough", "etc", "even", "ever", "every",
"everyone", "everything", "everywhere", "except", "few", "fifty", "first",
"five", "for", "former", "formerly", "forty", "found", "four", "from",
"further", "had", "has", "hasn", "hasn't", "have", "haven", "haven't",
"he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", 
"hers", "herself", "him", "himself", "his", "how", "however", "hundred", 
"ie", "i.e.", "if", "in", "inc", "inc.", "indeed", "instead", "into", "is",
"isn", "isn't", "it", "its", "itself", "last", "later", "latter", "latterly",
"least", "less", "let", "like", "likely", "ll", "ltd", "made", "make",
"makes", "many", "maybe", "me", "meantime", "meanwhile", "might", "million",
"miss", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "must",
"my", "myself", "namely", "neither", "never", "nevertheless", "next", "nine",
"ninety", "no", "nobody", "none", "nonetheless", "noone", "nor", "not",
"nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one",
"only", "onto", "or", "others", "otherwise", "our", "ours",
"ourselves", "out", "over", "overall", "own", "per", "perhaps", "rather",
"re", "recent", "recently", "same", "seem", "seemed", "seeming", "seems",
"seven", "seventy", "several", "she", "should", "shouldn", "shouldn't",
"since", "six", "sixty", "so", "some", "somehow", "someone", "something",
"sometime", "sometimes", "somewhere", "still", "stop", "such", "taking",
"ten", "than", "that", "the", "their", "them", "themselves", "then",
"thence", "there", "thereafter", "thereby", "therefore", "therein",
"thereupon", "these", "they", "thirty", "this", "those", "though",
"thousand", "three", "through", "throughout", "thru", "thus", "to",
"together", "too", "toward", "towards", "trillion", "twenty", "two", "under",
"unless", "unlike", "unlikely", "until", "up", "upon", "us", "used", "using",
"ve", "very", "via", "was", "wasn", "we", "we", "well", "were", "weren",
"weren't", "what", "whatever", "when", "whence", "whenever", "where", 
"whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", 
"whether", "which", "while", "whither", "who", "whoever", "whole", "whom", 
"whomever", "whose", "why", "will", "with", "within", "without", "won", 
"would", "wouldn", "wouldn't", "yes", "yet", "you", "your", "yours",
"yourself", "yourselves", NULL };

/*
** HTML entity table, terminated by a NULL pointer.  Entries come in triples:
**   { named entity (or "" when there is none), numeric reference, replacement }
** Names and references are stored without the trailing ';'.
**
** Every replacement is the single character whose code is the number in the
** numeric reference, with one deliberate exception kept from the original
** table: &nbsp / &#160 is replaced by an ordinary space.
**
** Fixes relative to the previous table:
**   - &#58 now yields ":" (was the empty string);
**   - &#95 now yields "_" (was "-", which is code 45);
**   - codes 161-255 yield the matching ISO-8859-1 character again (they had
**     all become empty strings, so those entities were deleted from the text
**     instead of converted).  The characters are written as octal escapes so
**     that this source file stays pure ASCII and survives charset conversion.
**
** NOTE(review): the names &die, &macron, &degree, &mu (for 182) and &Cedilla
** are kept exactly as they were because lookups elsewhere may depend on
** them; the standard ISO-8859-1 entity names for these codes are uml, macr,
** deg, para and cedil -- confirm whether those should be recognised as well.
*/
char *entities[] = {
/* 32-47: space and punctuation (40-42 are not listed) */
"", "&#32", " ",      "", "&#33", "!",      "&quot", "&#34", "\"",
"", "&#35", "#",      "", "&#36", "$",      "", "&#37", "%",
"&amp", "&#38", "&",  "", "&#39", "'",      "", "&#43", "+",
"", "&#44", ",",      "", "&#45", "-",      "", "&#46", ".",
"", "&#47", "/",
/* 48-57: digits */
"", "&#48", "0",      "", "&#49", "1",      "", "&#50", "2",
"", "&#51", "3",      "", "&#52", "4",      "", "&#53", "5",
"", "&#54", "6",      "", "&#55", "7",      "", "&#56", "8",
"", "&#57", "9",
/* 58-64: punctuation */
"", "&#58", ":",      "", "&#59", ";",      "&lt", "&#60", "<",
"", "&#61", "=",      "&gt", "&#62", ">",   "", "&#63", "?",
"", "&#64", "@",
/* 65-90: upper-case letters */
"", "&#65", "A",      "", "&#66", "B",      "", "&#67", "C",
"", "&#68", "D",      "", "&#69", "E",      "", "&#70", "F",
"", "&#71", "G",      "", "&#72", "H",      "", "&#73", "I",
"", "&#74", "J",      "", "&#75", "K",      "", "&#76", "L",
"", "&#77", "M",      "", "&#78", "N",      "", "&#79", "O",
"", "&#80", "P",      "", "&#81", "Q",      "", "&#82", "R",
"", "&#83", "S",      "", "&#84", "T",      "", "&#85", "U",
"", "&#86", "V",      "", "&#87", "W",      "", "&#88", "X",
"", "&#89", "Y",      "", "&#90", "Z",
/* 91-96: punctuation */
"", "&#91", "[",      "", "&#92", "\\",     "", "&#93", "]",
"", "&#94", "^",      "", "&#95", "_",      "", "&#96", "`",
/* 97-122: lower-case letters */
"", "&#97", "a",      "", "&#98", "b",      "", "&#99", "c",
"", "&#100", "d",     "", "&#101", "e",     "", "&#102", "f",
"", "&#103", "g",     "", "&#104", "h",     "", "&#105", "i",
"", "&#106", "j",     "", "&#107", "k",     "", "&#108", "l",
"", "&#109", "m",     "", "&#110", "n",     "", "&#111", "o",
"", "&#112", "p",     "", "&#113", "q",     "", "&#114", "r",
"", "&#115", "s",     "", "&#116", "t",     "", "&#117", "u",
"", "&#118", "v",     "", "&#119", "w",     "", "&#120", "x",
"", "&#121", "y",     "", "&#122", "z",
/* 123-126: punctuation */
"", "&#123", "{",     "", "&#124", "|",     "", "&#125", "}",
"", "&#126", "~",
/* 160: non-breaking space, replaced by a plain space */
"&nbsp", "&#160", " ",
/* 161-191: ISO-8859-1 symbols */
"&iexcl", "&#161", "\241",    "&cent", "&#162", "\242",
"&pound", "&#163", "\243",    "&curren", "&#164", "\244",
"&yen", "&#165", "\245",      "&brvbar", "&#166", "\246",
"&sect", "&#167", "\247",     "&die", "&#168", "\250",
"&copy", "&#169", "\251",     "&ordf", "&#170", "\252",
"&laquo", "&#171", "\253",    "&not", "&#172", "\254",
"&shy", "&#173", "\255",      "&reg", "&#174", "\256",
"&macron", "&#175", "\257",   "&degree", "&#176", "\260",
"&plusmn", "&#177", "\261",   "&sup2", "&#178", "\262",
"&sup3", "&#179", "\263",     "&acute", "&#180", "\264",
"&micro", "&#181", "\265",    "&mu", "&#182", "\266",
"&middot", "&#183", "\267",   "&Cedilla", "&#184", "\270",
"&sup1", "&#185", "\271",     "&ordm", "&#186", "\272",
"&raquo", "&#187", "\273",    "&frac14", "&#188", "\274",
"&frac12", "&#189", "\275",   "&frac34", "&#190", "\276",
"&iquest", "&#191", "\277",
/* 192-223: ISO-8859-1 upper-case letters (plus &times, &szlig) */
"&Agrave", "&#192", "\300",   "&Aacute", "&#193", "\301",
"&Acirc", "&#194", "\302",    "&Atilde", "&#195", "\303",
"&Auml", "&#196", "\304",     "&Aring", "&#197", "\305",
"&AElig", "&#198", "\306",    "&Ccedil", "&#199", "\307",
"&Egrave", "&#200", "\310",   "&Eacute", "&#201", "\311",
"&Ecirc", "&#202", "\312",    "&Euml", "&#203", "\313",
"&Igrave", "&#204", "\314",   "&Iacute", "&#205", "\315",
"&Icirc", "&#206", "\316",    "&Iuml", "&#207", "\317",
"&ETH", "&#208", "\320",      "&Ntilde", "&#209", "\321",
"&Ograve", "&#210", "\322",   "&Oacute", "&#211", "\323",
"&Ocirc", "&#212", "\324",    "&Otilde", "&#213", "\325",
"&Ouml", "&#214", "\326",     "&times", "&#215", "\327",
"&Oslash", "&#216", "\330",   "&Ugrave", "&#217", "\331",
"&Uacute", "&#218", "\332",   "&Ucirc", "&#219", "\333",
"&Uuml", "&#220", "\334",     "&Yacute", "&#221", "\335",
"&THORN", "&#222", "\336",    "&szlig", "&#223", "\337",
/* 224-255: ISO-8859-1 lower-case letters (plus &divide) */
"&agrave", "&#224", "\340",   "&aacute", "&#225", "\341",
"&acirc", "&#226", "\342",    "&atilde", "&#227", "\343",
"&auml", "&#228", "\344",     "&aring", "&#229", "\345",
"&aelig", "&#230", "\346",    "&ccedil", "&#231", "\347",
"&egrave", "&#232", "\350",   "&eacute", "&#233", "\351",
"&ecirc", "&#234", "\352",    "&euml", "&#235", "\353",
"&igrave", "&#236", "\354",   "&iacute", "&#237", "\355",
"&icirc", "&#238", "\356",    "&iuml", "&#239", "\357",
"&eth", "&#240", "\360",      "&ntilde", "&#241", "\361",
"&ograve", "&#242", "\362",   "&oacute", "&#243", "\363",
"&ocirc", "&#244", "\364",    "&otilde", "&#245", "\365",
"&ouml", "&#246", "\366",     "&divide", "&#247", "\367",
"&oslash", "&#248", "\370",   "&ugrave", "&#249", "\371",
"&uacute", "&#250", "\372",   "&ucirc", "&#251", "\373",
"&uuml", "&#252", "\374",     "&yacute", "&#253", "\375",
"&thorn", "&#254", "\376",    "&yuml", "&#255", "\377",
NULL };
#else
/* Declarations for the tables that the MAIN_FILE translation unit defines. */
extern char *indexchars;
extern char *defaultstopwords[];	/* NULL-terminated */
extern char *entities[];		/* NULL-terminated */
#endif

/* Prototypes needed only by the translation unit that defines MAIN_FILE. */
#ifdef MAIN_FILE

/*
** use _AP() for easier cross-compiler (non-ANSI) porting 
** <return value> <functionname> _AP( (<arg prototypes>) );
**
** NOTE(review): _AP is not defined in this header; presumably it comes from
** config.h -- confirm.
*/

int main _AP ((int, char **));
void usage _AP ((void));
void printversion _AP ((void));
void printrunning _AP ((long, long));

#endif

long getthetime _AP ((void));

/* use these to open Index files (because they are binary files: Win32)  */
FILE* openIndexFileForWrite _AP ((char* filename));
FILE* openIndexFileForRead _AP ((char* filename));
FILE* openIndexFileForReadAndWrite _AP ((char* filename));

/* These signatures mirror the function pointers of
** struct _indexing_data_source_def (indexpath_fn, vgetc_fn, vsize_fn,
** vtell_fn, vseek_fn, parseconfline_fn). */
void indexpath _AP((char *path));
int vgetc _AP((void *vp));
int vsize _AP((void *vp));
int vtell _AP((void *vp));
int vseek _AP((void *vp,long));
int parseconfline _AP((char *line));
void allocatedefaults _AP((void));

/* 04/00 Jose Ruiz
** Functions to read/write longs from/to the index file:
** printlong takes the stream and the value to write, readlong takes the
** stream and returns the value read.
*/
void printlong _AP ((FILE *, long));
long readlong _AP ((FILE *));
/*
** safestrcpy(n, to, from): bounded string copy that always terminates.
** strcpy doesn't check for overflow in the 'to' string and strncpy doesn't
** guarantee null byte termination, so this copies at most n bytes with
** strncpy() and then forces to[n-1] to '\0'.
**   n    - size of the 'to' buffer in bytes.  When n is not positive
**          nothing is written (previously n == 0 stored a byte at to[-1]).
**   to   - destination buffer; evaluated twice.
**   from - source string; its length can't be checked because it is
**          sometimes a function call, so it is evaluated at most once.
** Every argument is parenthesized so that expressions such as "buf + off"
** or "cond ? a : b" expand as intended.  The expansion is still a braced
** block, so it cannot be the unbraced body of an if that has an else.
*/
#define safestrcpy(n,to,from) \
  { if ((n) > 0) { strncpy((to),(from),(n)); (to)[(n)-1]='\0'; } }

/* Jose Ruiz 04/00
** Now this function is a macro for better performance
** void compress _AP ((int, FILE *));
**
** compress(num, fp): write num to stream fp as a variable-length number.
** The value is split into 7-bit groups, written most significant group
** first; every byte except the last has bit 128 set.  uncompress() is the
** matching reader.
**   num - value to write; evaluated once and converted to unsigned int, so
**         only non-negative values that fit an int round-trip.
**   fp  - FILE * to write to; evaluated once per byte written.
** Changes from the earlier version:
**   - the locals have macro-specific names, so an argument called i, r or s
**     (e.g. compress(i, fp)) is no longer captured by the macro's own
**     variables;
**   - 0 is written as a single 0 byte instead of nothing, so every
**     compress() produces something for uncompress() to consume;
**   - the groups are computed on an unsigned value, so a negative argument
**     can no longer produce negative remainders.
** The 8-byte buffer holds up to 56 bits, enough for a 32-bit unsigned int.
** fputc() errors are not checked, as before.
*/
#define compress(num,fp) \
{int compress_i_; unsigned int compress_r_; unsigned char compress_s_[8]; \
compress_i_=0; compress_r_=(unsigned int)(num); \
if(!compress_r_) compress_s_[compress_i_++] = 0; \
while(compress_r_) {compress_s_[compress_i_++] = (unsigned char)(compress_r_ % 128); compress_r_ /= 128;}\
while(--compress_i_ >=0) fputc(compress_s_[compress_i_] | (compress_i_ ? 128 : 0), (fp));}

/* Jose Ruiz 04/00
** Now this function is a macro for better performance
** int uncompress _AP ((FILE *));
**
** uncompress(num, fp): read one variable-length number written by
** compress() from stream fp and store it in num.
**   num - modifiable integer lvalue that receives the value; it is
**         evaluated several times, so it must have no side effects.
**   fp  - FILE * to read from; evaluated once per byte read.
** Each byte contributes its low 7 bits; reading stops after a byte whose
** bit 128 is clear, or as soon as the accumulated value is 0 (unchanged
** from the earlier version).
** Changes from the earlier version:
**   - end of file / read error stops the loop and leaves num == 0.
**     Previously EOF (-1) looked like a continuation byte, so a truncated
**     stream looped forever and overflowed num;
**   - the local has a macro-specific name, so uncompress(c, fp) stores into
**     the caller's c rather than the macro's own variable.
*/
#define uncompress(num,fp) \
{int uncompress_c_;(num) = 0;\
do{ \
   uncompress_c_=fgetc((fp));\
   if(uncompress_c_ == EOF) {(num) = 0; break;}\
   (num) *= 128; (num) += uncompress_c_ & 127;\
   if(!(num)) break;\
} while (uncompress_c_ & 128);}

/* Jose Ruiz 04/00
** Macro for copying positions between arrays of integers:
** copies num integers into dest (starting at index posdest) from
** orig (starting at index posorig).
** It is the memcpy() equivalent of
**   for (i = 0; i < num; i++) dest[i + posdest] = orig[i + posorig];
** so the two regions must not overlap.
** Every argument is parenthesized: previously an expression such as
** "n + 1" passed as num expanded to n + 1*sizeof(int) and copied the wrong
** number of bytes.  The expansion is a single expression (the memcpy call),
** as before.
*/
#define CopyPositions(dest,posdest,orig,posorig,num) \
memcpy((char *)((int *)(dest)+(posdest)),(char *)((int *)(orig)+(posorig)),(num)*sizeof(int))

/* Spellings of the search operator words: and, or, not, ... */
/* Change them here.  There is one word per non-zero *_RULE code defined
** earlier in this header (AND_RULE, OR_RULE, NOT_RULE, PHRASE_RULE,
** AND_NOT_RULE). */
#define AND_WORD "and"
#define OR_WORD "or"
#define NOT_WORD "not"
#define PHRASE_WORD "precd"
#define AND_NOT_WORD "andnot"

/* Delimiter of phrase search */
/* Change it if you prefer a different one; keep the char and string forms
** in sync, since both spell the same character. */
#define PHRASE_DELIMITER_CHAR '"'
#define PHRASE_DELIMITER_STRING "\""

