/* Simple finite automaton that extracts URL references from HTML/JavaScript text */
#include "defs.h"

/*
 * Parser state.  It is initialised by resetParser() and advanced one input
 * character at a time by checkChar().
 */

/* While a keyword is being matched: index of the next keyword character to
 * compare.  While a value is being saved: number of characters stored in
 * remember[] so far. */
static int  position;
/* Index into words[] of the current candidate/matched keyword; (-1) = none. */
static int  word = (-1);
/* TRUE while the characters of a value are being collected into remember[],
 * FALSE while the input is being scanned for a keyword. */
static Bool savemode;
/* TRUE from the moment a keyword is fully matched until the first significant
 * character of its value has been processed. */
static Bool beginning;
/* "Long mode": while set, every non-quote character is stored as is.
 * checkChar() turns it on for the "]=" keyword when parse_arrays is set. */
static Bool lookup_quote;

/* What terminates the value currently being saved (one of the codes below). */
static int  stopcondition;
#define BYQUOTE_DOUBLE		0
#define BYQUOTE_SINGLE		1
#define BYBREAKCHAR		2

/* Accumulator for the characters of the value being saved. */
static char remember[10000];

/* Result of addURL() for the most recent "base href=" reference seen by
 * rememberURL(); NULL until one is found. */
URL *basehref = NULL;

/*
 * Keywords that introduce a reference.  Each P_* macro is the index of the
 * string that follows it, so the two must be kept in step when entries are
 * added or reordered.
 *
 * checkChar() picks a candidate by its first character only.  Entries that
 * share a first character ("base href="/"background=", the two
 * "window.open(" forms, "onclick="/"onmouseover=") are told apart by
 * explicit special cases in checkChar().
 */
/* ONLY LOWERCASE LETTERS HERE !!! */
/* (checkChar() lowercases the input before comparing it with these strings) */
static  char *words[] = {
#define P_HREF                          0
	"href="      ,
#define P_BASEHREF                      1
	"base href=" ,
#define P_URL                           2
	"url="       ,

#define P_SRC                           3
	"src="       ,
#define P_LOWSRC                        4
	"lowsrc="    ,
#define P_BACKGROUND                    5
	"background=",

/* JavaScript */
#define P_OPEN1                         6
	"window.open(\"",
#define P_OPEN2                         7
	"window.open(\'",
#define P_VALUE				8
	"value=",

/* Java */
#define P_JAVA_CLASS			9
	"code=",
/* JavaScript */
#define P_JAVASCRIPT_ONCLICK		10
	"onclick=",
#define P_JAVASCRIPT_ONMOUSEOVER	11
	"onmouseover=",

#define P_ARRAY_ELEMENT			12
	"]=",

/* Garbage for scripts */
#define P_DATA				13
	"data=",

};
/* Number of leading entries of words[] that checkChar() scans when looking
 * for a new candidate; onlyHref() lowers it. */
static int NWORDS = sizeof(words)/sizeof(words[0]);

/*
 * Restrict the candidate scan in checkChar() to the keywords that precede
 * P_SRC in words[] ("href=", "base href=" and "url=").
 */
void onlyHref(){
	/* the scan for a new candidate covers words[0 .. NWORDS-1] */
	NWORDS = P_SRC;

	/* and blank the first excluded slot */
	words[P_SRC] = NULL;
}

/*
 * Tell whether c is a quote character.
 * Returns non-zero for a double quote (") or a single quote ('), 0 otherwise.
 */
Bool isQuote(unsigned c){
	const unsigned is_double = (c == '"');
	const unsigned is_single = (c == '\'');

	return (is_double | is_single);
}
/*
 * Tell whether c terminates an unquoted reference.
 *
 * Break characters are  " ' < > ; &  plus any whitespace when break_on_spaces
 * is set.  When nobreak_amp is set, '&' and ';' are NOT break characters, so
 * that entities such as &amp; stay inside the reference.
 * Characters like  . # _  are never break characters.
 *
 * Note: a NUL character is reported as a break character too, because
 * strchr() also matches the terminator of the set string.
 */
Bool isBreakChar(unsigned c){
	static const char breakers[] = "\"'<>;&";

	if(nobreak_amp && (c == ';' || c == '&'))	/* &amp; */
		return FALSE;

	if(break_on_spaces && isspace(c))
		return TRUE;

	if(strchr(breakers, c) != NULL)
		return TRUE;

	return FALSE;
}
/*
 * Put the automaton back into its initial state: no candidate keyword,
 * nothing saved, scanning (not saving) mode.
 */
void resetParser(){
	/* keyword matching */
	word		= (-1);    /* none */
	position	= 0;

	/* value collecting */
	savemode	= FALSE;
	beginning	= FALSE;
	lookup_quote	= FALSE;
	stopcondition	= BYQUOTE_DOUBLE;
}

/*
 * Known JavaScript call prefixes that wrap a reference, e.g.
 *   <A HREF="javascript:imgWindow('t/31.jpg')">
 *
 * The numeric field of each entry starts as 0; initWORDs() stores the length
 * of the string s into slen (presumably that second field - confirm against
 * the WORD declaration).
 */
WORD JavaScriptWords[] = {
	{ "javascript:imgWindow('",	0},
	{ "javascript:openwin('",	0},
	{ "javascript:GoToSld('",	0},
	{ "javascript:open_help_win('",	0},
	{ "javascript:location=\"",	0},
	{ "javascript:LaunchView('",	0},
};
/* Generic scheme prefix; rememberURL() compares it case-insensitively with
 * the beginning of a reference. */
char JavaScript[] = "javascript:";

/* Number of entries in JavaScriptWords[]. */
#define NJAVAWORDS (sizeof(JavaScriptWords)/sizeof(JavaScriptWords[0]))
/*
 * Fill in the slen field of every JavaScriptWords[] entry with the length
 * of its string s.
 */
void initWORDs(){
	size_t idx = 0;

	while(idx < NJAVAWORDS){
		JavaScriptWords[idx].slen = strlen(JavaScriptWords[idx].s);
		idx++;
	}
}
/*
 * Queue the powertabs.net download URL for the argument list of a
 * DownloadTab(a, b, c) / DownloadATab(a, b, c) call.
 *
 *   list - list the URL is added to
 *   args - text that follows the opening parenthesis, e.g. "1, 2, 3)"
 *
 * Nothing is added (only a warning is logged) unless all three numbers can
 * be parsed.
 */
static void addPowerTabURL(List *list, const char *args){
	long a, b, c;
	char bbb[1024];

	if(sscanf(args, "%ld, %ld, %ld)", &a, &b, &c) != 3){
		fprintf(fplog, "\t!!! Warning: bad DownloadTab arguments \"%s\"\n", args);
		return;
	}
	snprintf(bbb, sizeof(bbb),
		 "http://www.powertabs.net/pta.php?page=tab_download,%ld,%ld,%ld",
		 a, b, c);
	(void) addURL(list, bbb, NoFlags);
}

/*
 * Hand a collected reference over to addURL().
 *
 *   list - list the reference is added to
 *   href - NUL-terminated reference text; it may be modified in place
 *          (a closing quote inside it is overwritten with NUL)
 *
 * The keyword that introduced the reference is taken from the file-level
 * variable "word":
 *   - "base href=" references are added with FORCED|BASEHREF, stored in
 *     basehref and parsed immediately;
 *   - for "href=", "onclick=", "onmouseover=" and "]=" the text is inspected
 *     for JavaScript: a "javascript:" prefix is skipped and, for script code,
 *     only the first quoted string is used as the reference;
 *   - everything else is added unchanged.
 */
void rememberURL(List *list, char *href){
	char *ss;

	if(word == P_BASEHREF){
		/* basehref is cleared before addURL() is called; presumably so
		 * that the new base is not interpreted relative to the old
		 * one - confirm against addURL(). */
		basehref = NULL;
		basehref = addURL(list, href, FORCED|BASEHREF);

		fprintf(fplog, "\t@@@ BASE HREF=\"%s\"\n", href);

		/* The explicit call is needed here
		   for the case of the reference base href=ftp://....
		   to have urlname and hostname fields known (parsed).
		 */
		if(basehref)
			parseName(basehref);
		return;
	}

	/* SPECIAL ANALYSIS FOR THE FUNCTIONS IN THE HREF */
	if(word == P_HREF || word == P_JAVASCRIPT_ONCLICK ||
	   word == P_JAVASCRIPT_ONMOUSEOVER || word == P_ARRAY_ELEMENT){
		int do_lookup = 0;

		if(power_tab_archive && !strncmp(href, "DownloadTab(", 12)){
			addPowerTabURL(list, href + 12);
			return;
		}
		if(power_tab_archive && !strncmp(href, "DownloadATab(", 13)){
			addPowerTabURL(list, href + 13);
			return;
		}

		/* Any "javascript:" call is handled generically below, so the
		 * JavaScriptWords[] prefix table is not consulted here. */
		if(!strncasecmp(href, JavaScript, sizeof(JavaScript)-1)){
			href += sizeof(JavaScript)-1;
			/* javascript: f() */
			while(isspace((unsigned char) *href)) href++;
			do_lookup++;
		}
		/* the value of these keywords is always script code */
		if(word == P_JAVASCRIPT_ONCLICK ||
		   word == P_JAVASCRIPT_ONMOUSEOVER || word == P_ARRAY_ELEMENT)
			do_lookup++;

		if(do_lookup){
			/* Use the first quoted string as the reference:
			 * '...' is tried first, then "...". */
			ss = strchr(href, '\'');	/* Opening quote for 'name' */
			if(ss){
				href = ss+1;
				ss = strchr(href, '\'');
				if(ss) *ss = '\0';	/* Closing quote for 'name' */

				fprintf(fplog, "\t\tJAVASCRIPT REF '%s'\n", href);
			} else {
				ss = strchr(href, '"');
				if(ss){
					href = ss+1;
					ss = strchr(href, '"');
					if(ss) *ss = '\0';	/* Closing quote for "name" */

					fprintf(fplog, "\t\tJAVASCRIPT REF \"%s\"\n", href);
				}
			}
		}
	}
	(void) addURL(list, href, NoFlags);
}
/*
 * Tell whether c ends a quoted reference that was opened with "quote".
 *
 * Returns 1 when c is
 *   #1  the closing quote itself,
 *   #2  '>'  (closing quote lost),
 *   #3  '<'  (closing quote and '>' lost),
 *   #4  a space or a tab - only when neither trimURLspaces nor
 *       nobreak_spaces is set,
 * and 0 otherwise.
 */
int isStopper(int c, int quote){
	const int spaces_stop = !(trimURLspaces || nobreak_spaces);

	if(c == quote || c == '>' || c == '<')
		return 1;

	if(spaces_stop && (c == ' ' || c == '\t'))
		return 1;

	return 0;
}
void checkChar(List *list, unsigned c){
	int i;
	char *add;

	switch(savemode){
	case TRUE:
		if(lookup_quote && !isQuote(c)){
			remember[position++] = c;
			break;
		}
		if(beginning == TRUE){      /* the very beginning: first char after HREF= */
			if(c == '"') {
				stopcondition = BYQUOTE_DOUBLE;

				/* but don't save quote char itself - no need */
				/* unless we are in the long mode */
				if(lookup_quote) remember[position++] = c;
			}
			else if(c == '\'') {
				stopcondition = BYQUOTE_SINGLE;
				if(lookup_quote) remember[position++] = c;
			}
			else {
				stopcondition = BYBREAKCHAR;
				if(isspace(c)){
					/*
					 * For the mad guys who write
					 * <IMG SRC=    picture.gif >
					 *          ^^^^
					 */

					/* Keep state beginning == TRUE */
					return;
				}
				if(isBreakChar(c)){
					fprintf(fplog, "\tREF %s\"\"\n", words[word]);
					fprintf(fplog,
					   "\t!!! Warning: first char is a breakchar '%c'\n",
					   c);

					savemode = FALSE;
					/* do NOT do rememberURL() !!! */

					return;
				}
				remember[position++] = c;
			}
			/* else */
			beginning = FALSE;

		} else {
			/* check the stop condition */
			if(stopcondition == BYQUOTE_DOUBLE && isStopper(c, '"')){
				/*
				    #1 - is a correct ending           <A HREF="...">
				    #2 - is a bugfix for the lost "    <A HREF="...>
				    #3 - is a bugfix for the lost ">   <A HREF="... <BR>
				    #4 - is a bugfix for the lost "    <IMG SRC="... ALT="...">
				*/

				remember[position] = '\0';
				fprintf(fplog, "\tREF %s\"%s\"\n", words[word], remember);

				if(c != '"')
					fprintf(fplog, "\t!!! Warning: bad ending quote '%c'\n", c);

				for(add=remember; isspace(*add); add++);

				rememberURL(list, add);

				savemode = FALSE;

				/* don't retry the quote itself */
			} else
			if(stopcondition == BYQUOTE_SINGLE && isStopper(c, '\'')){
				/*
				    #1 - is a correct ending           <A HREF="...">
				    #2 - is a bugfix for the lost "    <A HREF="...>
				    #3 - is a bugfix for the lost ">   <A HREF="... <BR>
				    #4 - is a bugfix for the lost "    <IMG SRC="... ALT="...">
				*/

				remember[position] = '\0';
				fprintf(fplog, "\tREF %s\"%s\"\n", words[word], remember);

				if(c != '\'')
					fprintf(fplog, "\t!!! Warning: bad ending quote '%c'\n", c);

				for(add=remember; isspace(*add); add++);

				rememberURL(list, add);

				savemode = FALSE;

				/* don't retry the quote itself */


			} else if(stopcondition == BYBREAKCHAR && isBreakChar(c)){

				remember[position] = '\0';

				fprintf(fplog, "\tREF %s\"%s\"\n", words[word], remember);

				if( !nowarnflag)
					fprintf(fplog, "\t!!! Warning: HREF must be enclosed into \"...\"\n");

				for(add=remember; isspace(*add); add++);

				rememberURL(list, add);

				savemode = FALSE;

				/* don't retry the break char itself - since our break set
				 * can't be a beginning of HREF= sequence
				 */

			} else if((stopcondition == BYQUOTE_DOUBLE || stopcondition == BYQUOTE_SINGLE)
					&& (c=='\n' || c=='\r') ){
				/*
				 * <A HREF="http://host/wawa/inde
				 * x.html">
				 *
				 *      (sample from the Netscape's WWW server)
				 *      What to do? Ignore space!
				 *
				 *      HOWEVER the space character IS allowed!!!
				 *
				 * HREF="Such document name is legal"
				 *
				 *      This is MY understanding of this issue...
				 *      but take a look at the Netscape's doc errors!!!
				 *
				 * <IMG SRC="image3C.gif ALT="Jim's first snowman, Pierre">
				 *                      |__lost closing quote
				 *
				 *      What to do? Add ' ' to the ending quotes list!!!
				 */

				;

				fprintf(fplog, "\t!!! Warning: wild character \\%03o in HREF\n",
					       c);
				/* don't remember it ! */

			} else {

				remember[position++] = c;
			}
		}
		break;

	case FALSE:    /* analysis stage */

		if(isupper(c)) c = tolower(c);

check_other_word:

		if(word >= 0){

			if(isspace(c) && words[word][position] == '='){
				/* The mad case of HREF = "...."
						       ^ ^
						      spaces

				   Don't change position.
				 */
				if(!nowarnflag)
					fprintf(fplog, "\t!!! Warning: wild space after HREF/SRC =\n");
				return;
			}

			/* we have a candidate. check against it */
			if(c == (words[word][position] & 0xFF)){
				position++;

				if(position == strlen(words[word])){
					/* OK, the whole word is detected! */

					savemode  = TRUE;
					beginning = TRUE;
					position = 0;
					lookup_quote = FALSE;
					/* now "position" is a counter for saved chars */

					if(word == P_ARRAY_ELEMENT && parse_arrays){
						lookup_quote = TRUE;
					}

					return;
				}

			} else if(word == P_BASEHREF && position == 2 && c == 'c'){
				/*
					012...
					base href
					background
					  ^
				 */
				word = P_BACKGROUND;
				goto check_other_word;

			} else if(word == P_OPEN1 && position == 12 && c == '\''){
				/*
					01234567890123
					window.open("
						    ^        position == 12
				 */
				word = P_OPEN2;
				goto check_other_word;

			} else if(word == P_JAVASCRIPT_ONCLICK && position == 2 && c == 'm'){
				/*
					01234567890123
					onclick=
					onmouseover=
					  ^
			 	*/
				word = P_JAVASCRIPT_ONMOUSEOVER;
				goto check_other_word;
					
			} else {
				resetParser();  /* discard */
				/* and re-check THIS character */
				goto check_again;
			}

		} else {
			/* check if we have to begin */
		check_again:
			for(i=0; i < NWORDS; i++){
				if(c == (words[i][0] & 0xFF)){
					/* found */

					position=0;
					position++;

					word = i;
					/* our candidate:
					 *      all of them MUST begin with
					 *      the different letters
					 */
					return;
				}
			}
		}
		break;
	}
}
