1 files changed, 498 insertions, 0 deletions
diff --git a/whichasm-0.01/whichasm.c b/whichasm-0.01/whichasm.c
new file mode 100644
index 0000000..f305b6a
--- /dev/null
+++ b/whichasm-0.01/whichasm.c
@@ -0,0 +1,498 @@
+/*
+ * whichasm - which assembly language does this file use?
+ *
+ * This is a not particularly intelligent tool for attempting to detect
+ * the assembly language used within a particular source file, using a
+ * simple heuristic that the most popular language wins. We don't (yet)
+ * handle multiple assembly languages in a given file, but we could.
+ *
+ * Copyright (C) 2012 Jon Masters <jcm@jonmasters.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 (only) of the GNU General
+ * Public License as published by the Free Software Foundation.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <sys/types.h>
+#include <regex.h>
+
+#include "classifier.h"
+
+#define CLASSIFIER_THRESHOLD 1	/* scan_sourcefile reports a language only if its line count exceeds this */
+
+typedef int (*classifier_fn)(char *);	/* classifies one token; result is tested against MNEMONIC/REGISTER */
+
+struct classifier
+{
+	classifier_fn fn;	/* callback run on every token of a line */
+	const char *name;	/* name printed by main() when this entry wins */
+};
+
+static const struct classifier classifiers[] = {	/* position in this table is the index the scan_* functions return */
+	{ classifier_arm,	"arm" },
+	{ classifier_ppc,	"ppc" },
+	{ classifier_s390x,	"s390x" },
+	{ classifier_x86,	"x86" },
+	{ NULL, NULL }	/* sentinel: every loop over the table stops at the first NULL fn */
+};
+
+struct classifier_scores {	/* one classifier's tally for a single line, filled in by scan_tokens */
+	int mnemonic_index;	/* token index of a MNEMONIC hit, -1 if there was none */
+	int register_index;	/* token index of a REGISTER hit, initialised to -1 */
+	int score;		/* number of MNEMONIC/REGISTER tokens, or 0 if scan_tokens rejected the line */
+};
+
+/*
+ * make_token - allocate a list node holding a copy of one word
+ * @start - first character of the word inside the source line
+ * @len - number of characters in the word
+ *
+ * Returns the new node with its next pointer set to NULL.
+ * Exits the program if memory cannot be allocated.
+ */
+static struct token *make_token(const char *start, size_t len)
+{
+	// sizeof *node, not sizeof(node): the allocation must hold the
+	// whole struct, not merely a pointer to it
+	struct token *node = malloc(sizeof *node);
+
+	if (!node) {
+		printf("error allocating memory\n");
+		exit(1);
+	}
+
+	node->name = malloc(len + 1);
+	if (!node->name) {
+		printf("error allocating memory\n");
+		exit(1);
+	}
+	memcpy(node->name, start, len);
+	node->name[len] = '\0';
+	node->next = NULL;
+
+	return node;
+}
+
+/*
+ * tokenize - split the input line into a list of simple tokens
+ * @source - a line of "source" code from the input file
+ *
+ * A token is a run of characters that begins with an alphanumeric or
+ * '%' character directly after a separator (whitespace, a comma, or the
+ * start of the line) and ends at the first character that is neither
+ * alphanumeric nor '%'. A word that follows any other character is not
+ * turned into a token.
+ *
+ * For example "mov %eax, (r1)" yields "mov" and "%eax"; "r1" follows a
+ * parenthesis rather than a separator, so no token is made for it.
+ *
+ * Returns the head of a singly linked list in line order, or NULL when
+ * the line has no tokens. The caller releases it with free_tokens().
+ */
+struct token *tokenize(char *source)
+{
+	struct token *head = NULL;
+	struct token **link = &head;	// where the next node gets attached
+	size_t source_len = strlen(source);
+	size_t word_start = 0;
+	size_t pos;
+	int in_word = 0;
+	int in_space = 1;	// start off thinking we're at a separator
+
+	for (pos = 0; pos < source_len; pos++) {
+		// <ctype.h> needs an unsigned char value; a plain (signed)
+		// char above 0x7f would be undefined behaviour
+		unsigned char c = (unsigned char)source[pos];
+
+		if (isalnum(c) || ('%' == c)) {
+			// a word may only start right after a separator
+			if (in_space) {
+				word_start = pos;
+				in_word = 1;
+				in_space = 0;
+			}
+			continue;
+		}
+
+		// any other character ends the current word, if there is one
+		if (in_word) {
+			*link = make_token(&source[word_start],
+					   pos - word_start);
+			link = &(*link)->next;
+		}
+		in_word = 0;
+		in_space = (isspace(c) || (',' == c));
+	}
+
+	// handle last or sole word specially
+	if (in_word)
+		*link = make_token(&source[word_start], pos - word_start);
+
+	return head;
+}
+
+/*
+ * free_tokens - release every node of a token list
+ * @tokens - head of the list (may be NULL)
+ * Always returns 0.
+ */
+int free_tokens(struct token *tokens)
+{
+	struct token *cur, *after;
+
+	for (cur = tokens; cur != NULL; cur = after) {
+		after = cur->next;
+		free(cur->name);
+		free(cur);
+	}
+
+	return 0;
+}
+
+/*
+ * scan_tokens - parse tokens for assembly use
+ * @tokens - an input source line of tokens
+ *
+ * Every classifier is run over every token. A classifier earns one
+ * point per token it recognises as a MNEMONIC or a REGISTER, but its
+ * score is reset to zero when it saw no mnemonic at all, or when its
+ * first register appears before its first mnemonic.
+ *
+ * Returns the index into classifiers[] of the highest non-zero score
+ * (the earliest entry wins a tie), or -1 if nothing scored.
+ */
+int scan_tokens(struct token *tokens)
+{
+	struct classifier_scores scores[sizeof(classifiers)/
+					sizeof(struct classifier)];
+	const struct classifier *classifier;
+	struct classifier_scores *tally;
+	struct token *token;
+	int classifier_number, token_index, kind;
+	int winning_classifier = -1;
+	int winning_score = -1;
+
+	for (classifier_number = 0, classifier = classifiers;
+	     classifier->fn;
+	     classifier++, classifier_number++) {
+		tally = &scores[classifier_number];
+		tally->mnemonic_index = -1;
+		tally->register_index = -1;
+		tally->score = 0;
+
+		for (token_index = 0, token = tokens;
+		     token != NULL;
+		     token = token->next, token_index++) {
+			// special case ignore '%' signs (e.g. registers)
+			if ('%' == token->name[0])
+				kind = classifier->fn(&token->name[1]);
+			else
+				kind = classifier->fn(token->name);
+
+			// Remember only the first hit of each kind. The
+			// indices start at -1, so "not set yet" is < 0; a
+			// !index test is true only for 0 and never fires.
+			if (kind == MNEMONIC) {
+				tally->score++;
+				if (tally->mnemonic_index < 0)
+					tally->mnemonic_index = token_index;
+			}
+			if (kind == REGISTER) {
+				tally->score++;
+				if (tally->register_index < 0)
+					tally->register_index = token_index;
+			}
+		}
+
+		if (tally->mnemonic_index < 0)
+			// No opcode was found - probably not a match
+			tally->score = 0;
+		// >= 0: a register in the very first token counts as well
+		if ((tally->register_index >= 0) &&
+		    (tally->register_index < tally->mnemonic_index))
+			// Register came before opcode - probably not a match
+			tally->score = 0;
+	}
+
+	for (classifier_number = 0, classifier = classifiers;
+	     classifier->fn;
+	     classifier++, classifier_number++) {
+		tally = &scores[classifier_number];
+		// strictly greater, so the earliest classifier keeps a tie
+		if ((tally->score) && (tally->score > winning_score)) {
+			winning_score = tally->score;
+			winning_classifier = classifier_number;
+		}
+	}
+
+	return winning_classifier;
+}
+
+/*
+ * excluded_sourceline - decide whether a source line should be skipped
+ * @source - a line of source (pre-tokenization)
+ * Meant to filter out comment lines and assembler commands.
+ * TODO: Implement this function; for now no line is ever excluded.
+ */
+int excluded_sourceline(char *source)
+{
+	(void)source;	// unused until the filter exists
+	return 0;
+}
+
+/*
+ * scan_sourceline - classify a single line of input (file) source
+ * @source - textual string representation of an input line
+ * Returns the winning classifier index, or -1 if excluded/unmatched.
+ */
+int scan_sourceline(char *source)
+{
+	struct token *line_tokens;
+	int result;
+
+	if (excluded_sourceline(source) != 0)
+		return -1;
+
+	line_tokens = tokenize(source);
+	result = scan_tokens(line_tokens);
+	free_tokens(line_tokens);
+	return result;
+}
+
+/*
+ * usage - print a usage message to stdout
+ * @progname - name of the program as executed
+ * Always returns 0.
+ */
+int usage(char *progname)
+{
+	printf("Usage: %s [FILE]\n", progname);
+	fputs("Scan the input file for known assembly languages\n", stdout);
+	return 0;
+}
+
+/* Used to store state during file scanning */
+struct sourcefile {
+	char *file_name;	/* path handed to fopen() */
+	char *file_ext;		/* text after the last '.' in file_name */
+	FILE *file;		/* stream the lines are read from */
+	int mode;		/* 0: return every line, non-zero: only inline asm text */
+	int inside_asm;		/* non-zero while inside an unclosed multi-line asm block */
+};	
+
+/* Indices into the inline-asm pattern table of next_sourceline */
+enum { ASM_RE_SINGLE1, ASM_RE_SINGLE2, ASM_RE_OPEN1, ASM_RE_OPEN2,
+       ASM_RE_CLOSE, ASM_RE_COUNT, ASM_RE_NMATCH = 4 };
+
+/*
+ * asm_match - match @text against @first, falling back to @second
+ * Returns non-zero if a pattern matched and captured group 1.
+ */
+static int asm_match(const regex_t *first, const regex_t *second,
+		     const char *text, regmatch_t *pmatch)
+{
+	int ret_re = regexec(first, text, ASM_RE_NMATCH, pmatch, 0);
+
+	if (0 != ret_re)
+		ret_re = regexec(second, text, ASM_RE_NMATCH, pmatch, 0);
+
+	return (0 == ret_re) && (-1 != pmatch[1].rm_so);
+}
+
+/*
+ * asm_keep_group - replace @text by its own substring @group, in place
+ * (moved inside the caller's buffer, so no fixed-size scratch copy)
+ */
+static void asm_keep_group(char *text, const regmatch_t *group)
+{
+	size_t group_len = (size_t)(group->rm_eo - group->rm_so);
+
+	memmove(text, text + group->rm_so, group_len);
+	text[group_len] = '\0';
+}
+
+/*
+ * next_sourceline - get the next line to process
+ * @sourcefile - scanning state (open file, mode, inside_asm flag)
+ * @line - buffer that receives the line
+ * @len - size of @line in bytes
+ *
+ * With mode 0 every line of the file is returned unchanged. Otherwise
+ * lines are skipped until one contains an asm statement or lies inside a
+ * multi-line asm block; for an asm statement only the text captured by
+ * the pattern (what follows the opening parenthesis) is left in @line.
+ *
+ * Returns @line, or NULL once fgets() reports end of file or an error.
+ */
+char *next_sourceline(struct sourcefile *sourcefile, void *line, size_t len) {
+	static const char *const patterns[ASM_RE_COUNT] = {
+		"^.*[_]*asm[_]*[ \t]*[_]*volatile[_]*[ \t]*\\([ \t]*\"(.*)\\);",
+		"^.*[_]*asm[_]*[ \t]*\\([ \t]*\"(.*)\\);",
+		"^.*[_]*asm[_]*[ \t]*[_]*volatile[_]*[ \t]*\\([ \t]*[\"]*[ \t]*(.*)",
+		"^.*[_]*asm[_]*[ \t]*\\([ \t]*[\"]*[ \t]*(.*)",
+		".*\\);"
+	};
+	regex_t re[ASM_RE_COUNT];
+	regmatch_t pmatch[ASM_RE_NMATCH];
+	char *text;
+	int i;
+
+	if (!sourcefile->mode)
+		return fgets(line, len, sourcefile->file);
+
+	for (i = 0; i < ASM_RE_COUNT; i++) {
+		if (regcomp(&re[i], patterns[i], REG_EXTENDED|REG_ICASE)) {
+			printf("error compiling regex\n");
+			exit(1);
+		}
+	}
+
+	while ((text = fgets(line, len, sourcefile->file)) != NULL) {
+		// already in a multiblock?
+		if (sourcefile->inside_asm) {
+			if (regexec(&re[ASM_RE_CLOSE], text, ASM_RE_NMATCH,
+				    pmatch, 0) != 0)
+				break; // still inside: use current line
+			sourcefile->inside_asm = 0;
+		}
+
+		// try matching a singleton
+		if (asm_match(&re[ASM_RE_SINGLE1], &re[ASM_RE_SINGLE2],
+			      text, pmatch)) {
+			asm_keep_group(text, &pmatch[1]);
+			break; // use current line
+		}
+
+		// try multiblock option
+		if (asm_match(&re[ASM_RE_OPEN1], &re[ASM_RE_OPEN2],
+			      text, pmatch)) {
+			asm_keep_group(text, &pmatch[1]);
+			sourcefile->inside_asm = 1;
+			break; // use current line
+		}
+	}
+
+	// the patterns are compiled on every call, so release them again
+	for (i = 0; i < ASM_RE_COUNT; i++)
+		regfree(&re[i]);
+
+	return text;
+}
+
+/*
+ * scan_sourcefile - open a file and scan it
+ * @source_file_name - path of the file to classify
+ *
+ * The first letter of the extension picks the reading mode: 'c' or 'h'
+ * (any case) means C source, where only inline asm is examined; every
+ * other file is read line by line as assembly. Each line votes for at
+ * most one classifier.
+ *
+ * Returns the index into classifiers[] of the language with the most
+ * votes if that count exceeds CLASSIFIER_THRESHOLD, otherwise -1.
+ * Exits the program if the file cannot be opened or read.
+ */
+int scan_sourcefile(char *source_file_name) {
+
+	struct sourcefile sourcefile;
+	char input_line[255];
+	int classifier_totals[sizeof(classifiers)/sizeof(struct classifier)] = {0};
+	const struct classifier *classifier;
+	char *dot;
+	int first_ext_char;
+	int classifier_number;
+	int winning_classifier;
+	int overall_classifier = -1;
+	int overall_total = -1;
+
+	sourcefile.mode = 0;	// default: assembly source, use every line
+	sourcefile.inside_asm = 0;
+	sourcefile.file_name = source_file_name;
+
+	// a name without any '.' has an empty extension; strrchr() + 1
+	// on its NULL result would be undefined behaviour
+	dot = strrchr(sourcefile.file_name, '.');
+	sourcefile.file_ext = dot ? dot + 1 : "";
+
+	// only the first character is compared, case-insensitively
+	first_ext_char = tolower((unsigned char)sourcefile.file_ext[0]);
+	if (('c' == first_ext_char) || ('h' == first_ext_char)) {
+		// C source file
+		sourcefile.mode = 1;
+	}
+
+	sourcefile.file = fopen(sourcefile.file_name, "r");
+	if (!sourcefile.file) {
+		printf("error opening file\n");
+		exit(1);
+	}
+
+	// fgets() returning NULL is the only reliable end-of-input signal,
+	// so loop on the read itself and sort out EOF vs. error afterwards
+	while (next_sourceline(&sourcefile, input_line, sizeof(input_line))) {
+		winning_classifier = scan_sourceline(input_line);
+		if (winning_classifier >= 0)
+			classifier_totals[winning_classifier]++;
+	}
+
+	// NULL without end-of-file means the read itself failed
+	if (!feof(sourcefile.file)) {
+		printf("error reading file\n");
+		exit(1);
+	}
+	fclose(sourcefile.file);
+
+	// strictly greater, so the earliest classifier keeps a tie
+	for (classifier_number = 0, classifier = classifiers;
+	     classifier->fn;
+	     classifier++, classifier_number++) {
+		if (classifier_totals[classifier_number] > overall_total) {
+			overall_classifier = classifier_number;
+			overall_total = classifier_totals[classifier_number];
+		}
+	}
+
+	if (overall_total > CLASSIFIER_THRESHOLD)
+		return overall_classifier;
+	else
+		return -1; // maybe not sure
+}
+
+/*
+ * main - entry point
+ * @argc - must be 2: the program name plus one file to scan
+ * @argv - argv[1] is the path of the file to scan
+ *
+ * Prints "<file>: <language>", or "<file>: unknown", and exits 0.
+ * A wrong argument count or a name without '.' prints usage and exits 1.
+ */
+int main(int argc, char **argv)
+{
+	char *path;
+	int verdict;
+
+	if (argc != 2) {
+		usage(argv[0]);
+		exit(1);
+	}
+	path = argv[1];
+
+	// the extension decides how the file is read, so insist on one
+	if (strrchr(path, '.') == NULL) {
+		printf("error: must be run on C or assembly source files\n");
+		usage(argv[0]);
+		exit(1);
+	}
+
+	verdict = scan_sourcefile(path);
+	if (verdict < 0)
+		printf("%s: unknown\n", path);
+	else
+		printf("%s: %s\n", path, classifiers[verdict].name);
+
+	exit(0);
+}