/*
 * whichasm - which assembly language does this file use?
 *
 * This is a not particularly intelligent tool for attempting to detect
 * the assembly language used within a particular source file, using a
 * simple heuristic that the most popular language wins. We don't (yet)
 * handle multiple assembly languages in a given file, but we could.
 *
 * Copyright (C) 2012 Jon Masters <jcm@jonmasters.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 (only) of the GNU General
 * Public License as published by the Free Software Foundation.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include <regex.h>

#include "classifier.h"

/*
 * Minimum evidence needed to name a language for a whole file: the
 * best classifier must have won strictly more than this many lines,
 * otherwise scan_sourcefile returns -1.
 */
#define CLASSIFIER_THRESHOLD 1

/*
 * Signature of a per-architecture classifier: it is handed one token
 * string and returns a class code, which scan_tokens compares against
 * MNEMONIC and REGISTER (presumably declared in classifier.h).
 */
typedef int (*classifier_fn)(char *);

/* One entry of the classifier table: the test function and its label. */
struct classifier
{
	classifier_fn fn;	// token classifier; NULL marks the end of the table
	const char *name;	// label printed by main for the winning entry
};

/*
 * Table of known classifiers, terminated by an entry whose fn is NULL.
 * The scan_* functions return indices into this table. Winners are
 * picked with strict '>' comparisons, so on a tie the earlier entry wins.
 */
static const struct classifier classifiers[] = {
	{ classifier_arm,	"arm" },
	{ classifier_ppc,	"ppc" },
	{ classifier_s390x,	"s390x" },
	{ classifier_x86,	"x86" },
	{ NULL, NULL }
};

/* Per-classifier tally for one tokenized line (filled in by scan_tokens). */
struct classifier_scores {
	int mnemonic_index;	// token position of a recognised mnemonic, -1 if none
	int register_index;	// token position of a recognised register, -1 if none
	int score;		// count of tokens recognised as mnemonic or register
};

/*
 * tokenize - split the input line into a list of simple tokens
 * @source - a line of "source" code from the input file
 */
struct token *tokenize(char *source)
{

	int this_char, source_len, word_start, in_word, in_space;
	struct token *tokens, *token, *last_token;

	tokens = NULL;
	token = NULL;
	last_token = NULL;

	word_start = in_word = 0;
	in_space = 1; // start off thinking we're at a separator
	source_len = strlen(source);
	for (this_char = 0; this_char < source_len; this_char++) {
		// state transition into a new word
		if ((isalnum(source[this_char])) ||
		    ('%' == source[this_char])) {
			if (in_space) {
				if (!in_word) {
					word_start = this_char;
				}
				in_word = 1;
				in_space = 0;
			}
		} else {
		// state transition out of a word
			if ((isspace(source[this_char])) ||
			    (',' == source[this_char]))
				in_space = 1;
			else
				in_space = 0;

			if (in_word && (this_char > word_start)) {
				token = malloc(sizeof(token));
				if (!token) {
					printf("error allocating memory\n");
					exit(1);
				}
				token->name = malloc(this_char-word_start+1);
				if (!token->name) {
					printf("error allocating memory\n");
					exit(1);
				}
				token->next = NULL;
				strncpy(token->name,
					&source[word_start],
					this_char-word_start);
				token->name[this_char-word_start]='\0';

				if (tokens)
					last_token->next = token;
				else
					tokens = token;
				last_token = token;
			}
		in_word = 0;
		}
	}

	// handle last or sole word specially
	if ((in_word) && (this_char > word_start)) {
		token = malloc(sizeof(token));
		if (!token) {
			printf("error allocating memory\n");
			exit(1);
		}
		token->name = malloc(this_char-word_start+1);
		if (!token->name) {
			printf("error allocating memory\n");
			exit(1);
		}
		token->next = NULL;
		strncpy(token->name, &source[word_start], this_char-word_start);
		token->name[this_char-word_start] = '\0';

		if (tokens)
			last_token->next = token;
		else
			tokens = token;
		last_token = token;
	}

	return tokens;
}

/*
 * free_tokens - release every node of a token list
 * @tokens - head of the list (may be NULL)
 *
 * Frees each node's name string and then the node itself.
 * Always returns 0.
 */
int free_tokens(struct token *tokens)
{
	struct token *node;
	struct token *following;

	for (node = tokens; node != NULL; node = following) {
		// remember the successor before the node goes away
		following = node->next;
		free(node->name);
		free(node);
	}

	return 0;
}

/*
 * scan_tokens - parse tokens for assembly use
 * @tokens - an input source line of tokens
 */
int scan_tokens(struct token *tokens)
{
	struct token *token;
	int token_index, class;
	int classifier_number;
	const struct classifier *classifier;
	struct classifier_scores classifier_scores[sizeof(classifiers)/
						   sizeof(struct classifier)];
	int winning_classifier;
	int winning_score;

	for (classifier_number=0,classifier=classifiers;
	     classifier->fn;
	     classifier++,classifier_number++) {

		classifier_scores[classifier_number].mnemonic_index = -1;
		classifier_scores[classifier_number].register_index = -1;
		classifier_scores[classifier_number].score = 0;

		for (token_index=0,token=tokens;
		     token!=NULL;
		     token=token->next,token_index++) {

			// special case ignore '%' signs (e.g. registers)
			if (strstr(token->name,"%") == token->name)
				class = classifier->fn(&token->name[1]);
			else
				class = classifier->fn(token->name);
			if (class == MNEMONIC) {
				classifier_scores[classifier_number].score++;
				classifier_scores[classifier_number].mnemonic_index = token_index;
			}
			if (class == REGISTER) {
				classifier_scores[classifier_number].score++;
				if (!classifier_scores[classifier_number].register_index)
					classifier_scores[classifier_number].register_index = token_index;
			}

		}

		if (classifier_scores[classifier_number].mnemonic_index < 0)
			// No opcode was found - probably not a match
			classifier_scores[classifier_number].score = 0;
		if ((classifier_scores[classifier_number].register_index > 0) &&
		    (classifier_scores[classifier_number].register_index <
		     classifier_scores[classifier_number].mnemonic_index))
			// Register came before opcode - probably not a match
			classifier_scores[classifier_number].score = 0;

	}

	winning_classifier = -1;
	winning_score = -1;

	for (classifier_number=0,classifier=classifiers;
	     classifier->fn;
	     classifier++,classifier_number++) {

		//printf("classifier %s score: %d\n",
		//       classifier->name,
		//       classifier_scores[classifier_number].score);
	
		if ((classifier_scores[classifier_number].score) &&
		    (classifier_scores[classifier_number].score >
		     winning_score))
		{
			winning_score =
				classifier_scores[classifier_number].score;
			winning_classifier = classifier_number;
		}
	}

	return winning_classifier;
}

/*
 * excluded_sourceline - filter out comment lines and assembler commands
 * @source - a line of source (pre-tokenization)
 *
 * Returns non-zero when the line should be skipped. Filtering is not
 * implemented yet, so every line is currently accepted (returns 0).
 * TODO: Implement this function
 */
int excluded_sourceline(char *source)
{
	(void)source; // unused until filtering is implemented

	return 0;
}

/*
 * scan_sourceline - classify one line of input (file) source
 * @source - textual string representation of an input line
 *
 * Returns the index of the winning classifier as reported by
 * scan_tokens, or -1 if the line is excluded or nothing matched.
 */
int scan_sourceline(char *source)
{
	struct token *line_tokens;
	int verdict;

	if (0 != excluded_sourceline(source))
		return -1;

	// the token list lives only for the duration of the scan
	line_tokens = tokenize(source);
	verdict = scan_tokens(line_tokens);
	free_tokens(line_tokens);

	return verdict;
}

/*
 * usage - print a short usage message to standard output
 * @progname - name of the program as executed
 *
 * Always returns 0.
 */
int usage(char *progname) {

	printf("Usage: %s [FILE]\n", progname);
	fputs("Scan the input file for known assembly languages\n", stdout);

	return 0;
}

/*
 * Used to store state during file scanning. Filled in by
 * scan_sourcefile and consulted/updated by next_sourceline.
 */
struct sourcefile {
	char *file_name;	// path handed to fopen
	char *file_ext;		// text following the last '.' of file_name
	FILE *file;		// stream the lines are read from
	int mode;		// 0: hand back every line; non-zero: only text found in asm(...) statements
	int inside_asm;		// non-zero while between an asm( opener and its closing ");"
};	

/*
 * compile_pattern - compile one extended, case-insensitive regex
 * @re - where to store the compiled pattern
 * @pattern - the regular expression text
 *
 * Exits the program if the pattern does not compile.
 */
static void compile_pattern(regex_t *re, const char *pattern)
{
	if (0 != regcomp(re, pattern, REG_EXTENDED|REG_ICASE)) {
		printf("error compiling regular expression\n");
		exit(1);
	}
}

/*
 * match_capture - try @first and, only if it fails, @second on @text
 * @span - receives the match offsets; span[1] is capture group 1
 *
 * Returns 1 if one of the patterns matched and group 1 took part.
 */
static int match_capture(const regex_t *first, const regex_t *second,
			 const char *text, regmatch_t span[2])
{
	int rc = regexec(first, text, 2, span, 0);

	if (0 != rc)
		rc = regexec(second, text, 2, span, 0);

	return (0 == rc) && (-1 != span[1].rm_so);
}

/*
 * keep_capture - replace @text by the part of it described by @span
 */
static void keep_capture(char *text, const regmatch_t *span)
{
	size_t n = (size_t)(span->rm_eo - span->rm_so);

	// source and destination overlap, so memmove rather than memcpy
	memmove(text, text + span->rm_so, n);
	text[n] = '\0';
}

/*
 * next_sourceline - get the next line to process
 * @sourcefile - scanning state: stream, mode and inside_asm flag
 * @line - caller's buffer that receives the text
 * @len - size of @line in bytes
 *
 * With mode 0 this is a plain fgets. Otherwise lines are read until
 * one of the following is found, and that text is left in @line:
 *  - a one-line asm("..."); statement: the text after the opening quote
 *  - the start of an asm( block: the text after the opener; inside_asm
 *    is then set
 *  - while inside_asm is set: any line that does not contain ");"
 * A line containing ");" ends the block; it is not returned as block
 * text but is still examined as a possible new asm statement.
 *
 * Returns @line, or NULL when fgets reports end of file or an error.
 */
char *next_sourceline(struct sourcefile *sourcefile, void *line, size_t len) {

	// compiled once on first use instead of on every call, which
	// used to leak five compiled patterns per line read
	static regex_t re_asm_singleton1;
	static regex_t re_asm_singleton2;
	static regex_t re_asm_block_open1;
	static regex_t re_asm_block_open2;
	static regex_t re_asm_block_close;
	static int patterns_ready;
	regmatch_t span[2];
	char *text = line; // typed view of the buffer for pointer arithmetic

	if (!sourcefile->mode)
		return fgets(text, (int)len, sourcefile->file);

	if (!patterns_ready) {
		compile_pattern(&re_asm_singleton1,
			"^.*[_]*asm[_]*[ \t]*[_]*volatile[_]*[ \t]*\\([ \t]*\"(.*)\\);");
		compile_pattern(&re_asm_singleton2,
			"^.*[_]*asm[_]*[ \t]*\\([ \t]*\"(.*)\\);");
		compile_pattern(&re_asm_block_open1,
			"^.*[_]*asm[_]*[ \t]*[_]*volatile[_]*[ \t]*\\([ \t]*[\"]*[ \t]*(.*)");
		compile_pattern(&re_asm_block_open2,
			"^.*[_]*asm[_]*[ \t]*\\([ \t]*[\"]*[ \t]*(.*)");
		compile_pattern(&re_asm_block_close, ".*\\);");
		patterns_ready = 1;
	}

	for (;;) {
		if (!fgets(text, (int)len, sourcefile->file))
			return NULL; // end of file

		// already in a multiblock?
		if (sourcefile->inside_asm) {
			if (0 != regexec(&re_asm_block_close, text, 0, NULL, 0))
				return text; // use current line
			sourcefile->inside_asm = 0;
		}

		// try matching a singleton
		if (match_capture(&re_asm_singleton1, &re_asm_singleton2,
				  text, span)) {
			keep_capture(text, &span[1]);
			return text; // use current line
		}

		// try multiblock option
		if (match_capture(&re_asm_block_open1, &re_asm_block_open2,
				  text, span)) {
			keep_capture(text, &span[1]);
			sourcefile->inside_asm = 1;
			return text; // use current line
		}
	}
}

/*
 * scan_sourcefile - open a file and scan it
 * @source_file_name - path of the file to classify
 *
 * Each line delivered by next_sourceline is classified on its own and
 * counts as one win for its classifier. Files whose extension starts
 * with 'c' or 'h' (any case; only the first character is looked at)
 * are read in mode 1, everything else in mode 0.
 *
 * Returns the table index of the classifier with the most line wins,
 * provided that count exceeds CLASSIFIER_THRESHOLD, otherwise -1.
 * Exits the program if the file cannot be opened or read.
 */
int scan_sourcefile(char *source_file_name) {

	struct sourcefile sourcefile;
	char input_line[255];
	int classifier_totals[sizeof(classifiers)/sizeof(struct classifier)] = {0};
	const struct classifier *classifier;
	int classifier_number;
	int winning_classifier;
	int overall_classifier = -1;
	int overall_total = -1;
	int ext_initial;
	char *dot;

	// a name without any '.' gets an empty extension instead of NULL+1
	dot = strrchr(source_file_name, '.');

	sourcefile.file_name = source_file_name;
	sourcefile.file_ext = dot ? dot + 1 : "";
	sourcefile.inside_asm = 0;

	// C source or header: only look inside asm statements (mode 1);
	// anything else, e.g. a .S assembly file: look at every line (mode 0)
	ext_initial = tolower((unsigned char)sourcefile.file_ext[0]);
	sourcefile.mode = (('c' == ext_initial) || ('h' == ext_initial));

	sourcefile.file = fopen(sourcefile.file_name, "r");
	if (!sourcefile.file) {
		printf("error opening file\n");
		exit(1);
	}

	while (next_sourceline(&sourcefile, input_line, sizeof(input_line))) {
		winning_classifier = scan_sourceline(input_line);
		if (winning_classifier >= 0)
			classifier_totals[winning_classifier]++;
	}

	// NULL from next_sourceline means end of file or a read error
	if (ferror(sourcefile.file)) {
		printf("error reading file\n");
		exit(1);
	}
	fclose(sourcefile.file);

	for (classifier_number=0,classifier=classifiers;
	     classifier->fn;
	     classifier++,classifier_number++) {
		if (classifier_totals[classifier_number] > overall_total) {
			overall_classifier = classifier_number;
			overall_total = classifier_totals[classifier_number];
		}
	}

	if (overall_total > CLASSIFIER_THRESHOLD)
		return overall_classifier;

	return -1; // maybe not sure
}

/*
 * main - entry point
 *
 * Expects exactly one argument, a file name containing a '.', and
 * prints "<file>: <language>" or "<file>: unknown". Exits with status 1
 * after printing the usage text when the arguments are unusable.
 */
int main(int argc, char **argv)
{
	char *path;
	int verdict;

	if (2 != argc) {
		usage(argv[0]);
		exit(1);
	}

	path = argv[1];

	// the extension decides how the file is read, so one is required
	if (NULL == strrchr(path, '.')) {
		printf("error: must be run on C or assembly source files\n");
		usage(argv[0]);
		exit(1);
	}

	verdict = scan_sourcefile(path);

	if (verdict < 0)
		printf("%s: unknown\n", path);
	else
		printf("%s: %s\n", path, classifiers[verdict].name);

	exit(0);
}