summaryrefslogtreecommitdiffstats
path: root/whichasm-0.01/whichasm.c
diff options
context:
space:
mode:
Diffstat (limited to 'whichasm-0.01/whichasm.c')
-rw-r--r--whichasm-0.01/whichasm.c498
1 files changed, 498 insertions, 0 deletions
diff --git a/whichasm-0.01/whichasm.c b/whichasm-0.01/whichasm.c
new file mode 100644
index 0000000..f305b6a
--- /dev/null
+++ b/whichasm-0.01/whichasm.c
@@ -0,0 +1,498 @@
+/*
+ * whichasm - which assembly language does this file use?
+ *
+ * This is a not particularly intelligent tool for attempting to detect
+ * the assembly language used within a particular source file, using a
+ * simple heuristic that the most popular language wins. We don't (yet)
+ * handle multiple assembly languages in a given file, but we could.
+ *
+ * Copyright (C) 2012 Jon Masters <jcm@jonmasters.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 (only) of the GNU General
+ * Public License as published by the Free Software Foundation.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <ctype.h>
+#include <sys/types.h>
+#include <regex.h>
+
+#include "classifier.h"
+
+#define CLASSIFIER_THRESHOLD 1 /* scan_sourcefile reports a language only when its count of winning lines exceeds this value */
+
+typedef int (*classifier_fn)(char *); /* called with the text of one token; scan_tokens compares the result against MNEMONIC and REGISTER */
+
+struct classifier
+{
+ classifier_fn fn; /* per-token classification callback; NULL marks the end of the table */
+ const char *name; /* label printed by main when this classifier wins */
+};
+
+/*
+ * Table of the known classifiers. The position of an entry in this
+ * table is the classifier number that the scanning functions count and
+ * return, and that main uses to look up the name to print. The entry
+ * whose fn is NULL terminates every walk over the table and must stay
+ * last.
+ */
+static const struct classifier classifiers[] = {
+	{ .fn = classifier_arm,   .name = "arm"   },
+	{ .fn = classifier_ppc,   .name = "ppc"   },
+	{ .fn = classifier_s390x, .name = "s390x" },
+	{ .fn = classifier_x86,   .name = "x86"   },
+	{ .fn = NULL,             .name = NULL    }
+};
+
+struct classifier_scores { /* per-classifier results for one line of tokens, filled in by scan_tokens */
+ int mnemonic_index; /* token index of a token classified as MNEMONIC; starts at -1 */
+ int register_index; /* token index of a token classified as REGISTER; starts at -1 */
+ int score; /* number of tokens classified as MNEMONIC or REGISTER; reset to 0 when the line is rejected */
+};
+
+/*
+ * append_token - add a new token to the end of a token list
+ * @head - address of the pointer to the first list element
+ * @tail - address of the pointer to the last list element
+ * @text - first character of the word (need not be NUL-terminated)
+ * @len - number of characters of @text that make up the word
+ *
+ * The word is copied into freshly allocated, NUL-terminated storage.
+ * Prints a message and exits the program if allocation fails.
+ */
+static void append_token(struct token **head, struct token **tail,
+			 const char *text, size_t len)
+{
+	struct token *token;
+
+	// sizeof(*token), not sizeof(token): room is needed for the whole
+	// structure, not merely for a pointer to it
+	token = malloc(sizeof(*token));
+	if (!token) {
+		printf("error allocating memory\n");
+		exit(1);
+	}
+	token->name = malloc(len + 1);
+	if (!token->name) {
+		printf("error allocating memory\n");
+		exit(1);
+	}
+	memcpy(token->name, text, len);
+	token->name[len] = '\0';
+	token->next = NULL;
+
+	if (*head)
+		(*tail)->next = token;
+	else
+		*head = token;
+	*tail = token;
+}
+
+/*
+ * tokenize - split the input line into a list of simple tokens
+ * @source - a line of "source" code from the input file
+ *
+ * A token is a run of alphanumeric characters and '%' signs. A token
+ * can only begin at the start of the line or directly after whitespace
+ * or a comma; a run that directly follows any other character is
+ * skipped. Any character outside the alphanumeric/'%' set ends the
+ * current token.
+ *
+ * Returns the head of a newly allocated singly linked list (release it
+ * with free_tokens), or NULL when the line contains no tokens.
+ */
+struct token *tokenize(char *source)
+{
+	struct token *tokens = NULL;
+	struct token *last_token = NULL;
+	int this_char, source_len;
+	int word_start = 0;
+	int in_word = 0;
+	int in_space = 1;	// start off thinking we're at a separator
+	unsigned char c;
+
+	source_len = strlen(source);
+	for (this_char = 0; this_char < source_len; this_char++) {
+		// <ctype.h> functions need a value representable as
+		// unsigned char; a plain (possibly negative) char is not
+		c = (unsigned char)source[this_char];
+
+		if (isalnum(c) || ('%' == c)) {
+			// state transition into a new word
+			if (in_space) {
+				if (!in_word)
+					word_start = this_char;
+				in_word = 1;
+				in_space = 0;
+			}
+			continue;
+		}
+
+		// state transition out of a word
+		in_space = (isspace(c) || (',' == c)) ? 1 : 0;
+		if (in_word && (this_char > word_start))
+			append_token(&tokens, &last_token,
+				     &source[word_start],
+				     (size_t)(this_char - word_start));
+		in_word = 0;
+	}
+
+	// a word that runs up to the end of the line was not yet emitted
+	if (in_word && (this_char > word_start))
+		append_token(&tokens, &last_token, &source[word_start],
+			     (size_t)(this_char - word_start));
+
+	return tokens;
+}
+
+/*
+ * free_tokens - release every element of a token list
+ * @tokens - head of the list (may be NULL, in which case nothing is done)
+ *
+ * Frees both the name string and the list node of each element.
+ * Always returns 0.
+ */
+int free_tokens(struct token *tokens)
+{
+	struct token *current, *following;
+
+	for (current = tokens; current != NULL; current = following) {
+		// remember the successor before the node goes away
+		following = current->next;
+		free(current->name);
+		free(current);
+	}
+
+	return 0;
+}
+
+/*
+ * scan_tokens - find the classifier that best matches one line of tokens
+ * @tokens - an input source line of tokens (may be NULL)
+ *
+ * Every classifier in the table is run over every token. A classifier
+ * scores one point per token it reports as MNEMONIC or REGISTER. Its
+ * score is discarded when it saw no mnemonic at all, or when the first
+ * register on the line appears before the (last) mnemonic.
+ *
+ * Returns the table index of the classifier with the highest non-zero
+ * score (the earliest one on a tie), or -1 if no classifier scored.
+ */
+int scan_tokens(struct token *tokens)
+{
+	const struct classifier *classifier;
+	struct classifier_scores current;
+	struct token *token;
+	char *name;
+	int token_index, class, classifier_number;
+	int winning_classifier = -1;
+	int winning_score = -1;
+
+	for (classifier_number = 0, classifier = classifiers;
+	     classifier->fn;
+	     classifier++, classifier_number++) {
+
+		current.mnemonic_index = -1;
+		current.register_index = -1;
+		current.score = 0;
+
+		for (token_index = 0, token = tokens;
+		     token != NULL;
+		     token = token->next, token_index++) {
+
+			// special case ignore '%' signs (e.g. registers)
+			name = token->name;
+			if (name[0] == '%')
+				name++;
+
+			class = classifier->fn(name);
+			if (class == MNEMONIC) {
+				current.score++;
+				current.mnemonic_index = token_index;
+			}
+			if (class == REGISTER) {
+				current.score++;
+				// -1 means "not seen yet"; testing for zero
+				// here would never record any register
+				if (current.register_index < 0)
+					current.register_index = token_index;
+			}
+		}
+
+		// No opcode was found - probably not a match
+		if (current.mnemonic_index < 0)
+			current.score = 0;
+
+		// Register came before opcode - probably not a match
+		// (>= 0 so that a register in the very first token counts)
+		if ((current.register_index >= 0) &&
+		    (current.register_index < current.mnemonic_index))
+			current.score = 0;
+
+		// strictly greater: the earliest classifier wins a tie
+		if (current.score && (current.score > winning_score)) {
+			winning_score = current.score;
+			winning_classifier = classifier_number;
+		}
+	}
+
+	return winning_classifier;
+}
+
+/*
+ * excluded_sourceline - decide whether a raw source line should be skipped
+ * @source - a line of source (pre-tokenization)
+ *
+ * Meant to filter out comment lines and assembler commands.
+ * TODO: Implement this function - for now no line is ever excluded.
+ *
+ * Returns 0 (the caller treats a non-zero result as "skip this line").
+ */
+int excluded_sourceline(char *source)
+{
+	(void)source;	// not looked at until the filter is implemented
+
+	return 0;
+}
+
+/*
+ * scan_sourceline - classify one line of input (file) source
+ * @source - textual string representation of an input line
+ *
+ * Returns the index of the winning classifier as reported by
+ * scan_tokens, or -1 when the line is excluded or nothing matched.
+ */
+int scan_sourceline(char *source)
+{
+	struct token *line_tokens;
+	int result;
+
+	if (excluded_sourceline(source) != 0)
+		return -1;
+
+	// the token list only lives for the duration of this call
+	line_tokens = tokenize(source);
+	result = scan_tokens(line_tokens);
+	free_tokens(line_tokens);
+
+	return result;
+}
+
+/*
+ * usage - print a short usage message on standard output
+ * @progname - name of the program as executed
+ *
+ * Always returns 0.
+ */
+int usage(char *progname)
+{
+	printf("Usage: %s [FILE]\n", progname);
+	fputs("Scan the input file for known assembly languages\n", stdout);
+
+	return 0;
+}
+
+/* Used to store state during file scanning */
+struct sourcefile {
+ char *file_name; /* path of the file being scanned, as passed to fopen */
+ char *file_ext; /* points just past the last '.' of file_name */
+ FILE *file; /* stream the source lines are read from */
+ int mode; /* 0: next_sourceline returns every line; non-zero: only text found by the asm patterns */
+ int inside_asm; /* set when an asm block opening was matched, cleared when a line matching the block close is read */
+};
+
+/*
+ * compile_asm_pattern - compile one extended, case-insensitive regex
+ * @re - where to store the compiled expression
+ * @pattern - the regular expression source
+ *
+ * Prints a message and exits the program if compilation fails.
+ */
+static void compile_asm_pattern(regex_t *re, const char *pattern)
+{
+	if (regcomp(re, pattern, REG_EXTENDED|REG_ICASE) != 0) {
+		printf("error compiling regular expression\n");
+		exit(1);
+	}
+}
+
+/*
+ * extract_asm_text - reduce a line to the text captured by a pattern
+ * @first - pattern tried first
+ * @second - pattern tried only when @first does not match
+ * @text - the line; overwritten with the first capture group on success
+ *
+ * Returns 1 if one of the patterns matched and @text was rewritten,
+ * 0 if @text was left alone.
+ */
+static int extract_asm_text(const regex_t *first, const regex_t *second,
+			    char *text)
+{
+	regmatch_t pmatch[2];
+	size_t match_len;
+
+	if ((regexec(first, text, 2, pmatch, 0) != 0) &&
+	    (regexec(second, text, 2, pmatch, 0) != 0))
+		return 0;
+	if (-1 == pmatch[1].rm_so)
+		return 0;
+
+	// source and destination overlap, hence memmove; moving in place
+	// also means no intermediate buffer can ever be too small
+	match_len = (size_t)(pmatch[1].rm_eo - pmatch[1].rm_so);
+	memmove(text, text + pmatch[1].rm_so, match_len);
+	text[match_len] = '\0';
+
+	return 1;
+}
+
+/*
+ * next_sourceline - get the next line to process
+ * @sourcefile - scanning state: the open file, the mode and whether we
+ *               are currently inside a multi-line asm block
+ * @line - buffer that receives the line
+ * @len - size of @line in bytes
+ *
+ * When sourcefile->mode is 0 every line of the file is returned as is.
+ * Otherwise lines are skipped until one contains an asm statement:
+ *  - a statement opened and closed on one line yields the text between
+ *    the opening quote and the closing ");"
+ *  - a statement that is only opened yields the text after the opening
+ *    parenthesis/quote and sets sourcefile->inside_asm
+ *  - while inside_asm is set, lines are returned unchanged until one
+ *    matches ");", which clears inside_asm again
+ *
+ * The patterns are compiled on first use and kept for the lifetime of
+ * the process, so nothing is recompiled or leaked per line.
+ *
+ * Returns @line, or NULL at end of file or on a read error.
+ */
+char *next_sourceline(struct sourcefile *sourcefile, void *line, size_t len)
+{
+	static regex_t re_asm_singleton1;
+	static regex_t re_asm_singleton2;
+	static regex_t re_asm_block_open1;
+	static regex_t re_asm_block_open2;
+	static regex_t re_asm_block_close;
+	static int patterns_ready;
+	char *text;
+
+	if (!sourcefile->mode)
+		return fgets(line, (int)len, sourcefile->file);
+
+	if (!patterns_ready) {
+		compile_asm_pattern(&re_asm_singleton1,
+			"^.*[_]*asm[_]*[ \t]*[_]*volatile[_]*[ \t]*\\([ \t]*\"(.*)\\);");
+		compile_asm_pattern(&re_asm_singleton2,
+			"^.*[_]*asm[_]*[ \t]*\\([ \t]*\"(.*)\\);");
+		compile_asm_pattern(&re_asm_block_open1,
+			"^.*[_]*asm[_]*[ \t]*[_]*volatile[_]*[ \t]*\\([ \t]*[\"]*[ \t]*(.*)");
+		compile_asm_pattern(&re_asm_block_open2,
+			"^.*[_]*asm[_]*[ \t]*\\([ \t]*[\"]*[ \t]*(.*)");
+		compile_asm_pattern(&re_asm_block_close,
+			".*\\);");
+		patterns_ready = 1;
+	}
+
+	while ((text = fgets(line, (int)len, sourcefile->file)) != NULL) {
+
+		// already in a multiblock?
+		if (sourcefile->inside_asm) {
+			if (regexec(&re_asm_block_close, text, 0, NULL, 0) != 0)
+				break; // use current line
+			// the closing line may itself start a new statement,
+			// so carry on and try the patterns below
+			sourcefile->inside_asm = 0;
+		}
+
+		// try matching a singleton
+		if (extract_asm_text(&re_asm_singleton1,
+				     &re_asm_singleton2, text))
+			break; // use current line
+
+		// try multiblock option
+		if (extract_asm_text(&re_asm_block_open1,
+				     &re_asm_block_open2, text)) {
+			sourcefile->inside_asm = 1;
+			break; // use current line
+		}
+	}
+
+	return text;
+}
+
+/*
+ * scan_sourcefile - open a file, classify each line and pick a winner
+ * @source_file_name - path of the file to scan
+ *
+ * Files whose extension starts with 'c' or 'h' (in either case) are
+ * read in asm-extraction mode; every other file has all of its lines
+ * scanned. Each line that produces a winning classifier adds one to
+ * that classifier's total.
+ *
+ * Prints a message and exits the program if the file cannot be opened
+ * or read.
+ *
+ * Returns the table index of the classifier with the highest total
+ * (the earliest one on a tie) if that total exceeds
+ * CLASSIFIER_THRESHOLD, otherwise -1.
+ */
+int scan_sourcefile(char *source_file_name)
+{
+	struct sourcefile sourcefile;
+	char input_line[255];
+	int classifier_totals[sizeof(classifiers)/sizeof(struct classifier)] = { 0 };
+	const struct classifier *classifier;
+	int classifier_number;
+	int winning_classifier;
+	int overall_classifier = -1;
+	int overall_total = -1;
+	char *dot;
+
+	sourcefile.mode = 0;
+	sourcefile.inside_asm = 0;
+	sourcefile.file_name = source_file_name;
+
+	// a name without any '.' gets an empty extension instead of a
+	// pointer computed from NULL
+	dot = strrchr(source_file_name, '.');
+	sourcefile.file_ext = dot ? dot + 1 :
+		source_file_name + strlen(source_file_name);
+
+	// assembly sources (.s/.S) and unknown extensions keep mode 0 and
+	// have every line scanned; C sources and headers switch to mode 1
+	if ((0 == strncasecmp(sourcefile.file_ext, "C", strlen("C"))) ||
+	    (0 == strncasecmp(sourcefile.file_ext, "H", strlen("H")))) {
+		// C source file
+		sourcefile.mode = 1;
+	}
+
+	sourcefile.file = fopen(sourcefile.file_name, "r");
+	if (!sourcefile.file) {
+		printf("error opening file\n");
+		exit(1);
+	}
+
+	while (next_sourceline(&sourcefile, input_line, sizeof(input_line))) {
+		winning_classifier = scan_sourceline(input_line);
+		if (winning_classifier >= 0)
+			classifier_totals[winning_classifier]++;
+	}
+
+	// NULL from next_sourceline means end of file or a read error
+	if (!feof(sourcefile.file)) {
+		printf("error reading file\n");
+		exit(1);
+	}
+	fclose(sourcefile.file);
+
+	for (classifier_number = 0, classifier = classifiers;
+	     classifier->fn;
+	     classifier++, classifier_number++) {
+		if (classifier_totals[classifier_number] > overall_total) {
+			overall_classifier = classifier_number;
+			overall_total = classifier_totals[classifier_number];
+		}
+	}
+
+	if (overall_total > CLASSIFIER_THRESHOLD)
+		return overall_classifier;
+
+	return -1; // maybe not sure
+}
+
+/*
+ * main - entry point
+ *
+ * Expects exactly one argument, the file to scan, whose name must
+ * contain a '.'. Prints "<file>: <language>" or "<file>: unknown".
+ * The exit status is 1 for a usage error and 0 otherwise.
+ */
+int main(int argc, char **argv)
+{
+	char *path;
+	int best;
+
+	if (argc != 2) {
+		usage(argv[0]);
+		return 1;
+	}
+
+	path = argv[1];
+
+	// without a '.' there is no extension to pick the scan mode from
+	if (strrchr(path, '.') == NULL) {
+		printf("error: must be run on C or assembly source files\n");
+		usage(argv[0]);
+		return 1;
+	}
+
+	best = scan_sourcefile(path);
+
+	if (best < 0)
+		printf("%s: unknown\n", path);
+	else
+		printf("%s: %s\n", path, classifiers[best].name);
+
+	return 0;
+}