diff options
Diffstat (limited to 'whichasm-0.01/whichasm.c')
-rw-r--r-- | whichasm-0.01/whichasm.c | 498 |
1 files changed, 498 insertions, 0 deletions
diff --git a/whichasm-0.01/whichasm.c b/whichasm-0.01/whichasm.c new file mode 100644 index 0000000..f305b6a --- /dev/null +++ b/whichasm-0.01/whichasm.c @@ -0,0 +1,498 @@ +/* + * whichasm - which assembly language does this file use? + * + * This is a not particularly intelligent tool for attempting to detect + * the assembly language used within a particular source file, using a + * simple heuristic that the most popular language wins. We don't (yet) + * handle multiple assembly languages in a given file, but we could. + * + * Copyright (C) 2012 Jon Masters <jcm@jonmasters.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 (only) of the GNU General + * Public License as published by the Free Software Foundation. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <sys/types.h> +#include <regex.h> + +#include "classifier.h" + +#define CLASSIFIER_THRESHOLD 1 + +typedef int (*classifier_fn)(char *); + +struct classifier +{ + classifier_fn fn; + const char *name; +}; + +static const struct classifier classifiers[] = { + { classifier_arm, "arm" }, + { classifier_ppc, "ppc" }, + { classifier_s390x, "s390x" }, + { classifier_x86, "x86" }, + { NULL, NULL } +}; + +struct classifier_scores { + int mnemonic_index; + int register_index; + int score; +}; + +/* + * tokenize - split the input line into a list of simple tokens + * @source - a line of "source" code from the input file + */ +struct token *tokenize(char *source) +{ + + int this_char, source_len, word_start, in_word, in_space; + struct token *tokens, *token, *last_token; + + tokens = NULL; + token = NULL; + last_token = NULL; + + word_start = in_word = 0; + in_space = 1; // start off thinking we're at a separator + source_len = strlen(source); + for (this_char = 0; this_char < source_len; this_char++) { + // state transition into a new word + if ((isalnum(source[this_char])) || + ('%' == source[this_char])) { + if (in_space) { + if (!in_word) { + word_start = this_char; + } + in_word = 1; + in_space = 0; + } + } else { + // state transition out of a word + if ((isspace(source[this_char])) || + (',' == source[this_char])) + in_space = 1; + else + in_space = 0; + + if (in_word && (this_char > word_start)) { + token = malloc(sizeof(token)); + if (!token) { + printf("error allocating memory\n"); + exit(1); + } + token->name = malloc(this_char-word_start+1); + if (!token->name) { + printf("error allocating memory\n"); + exit(1); + } + token->next = NULL; + strncpy(token->name, + &source[word_start], + this_char-word_start); + token->name[this_char-word_start]='\0'; + + if (tokens) + last_token->next = token; + else + tokens = token; + last_token = token; + } + in_word = 0; + } + } + + // handle last or sole word specially + if ((in_word) && (this_char > word_start)) { + token = malloc(sizeof(token)); + if (!token) { + printf("error allocating memory\n"); + exit(1); + } + token->name = malloc(this_char-word_start+1); + if (!token->name) { + printf("error allocating memory\n"); + exit(1); + } + token->next = NULL; + strncpy(token->name, &source[word_start], this_char-word_start); + token->name[this_char-word_start] = '\0'; + + if (tokens) + last_token->next = token; + else + tokens = token; + last_token = token; + } + + return tokens; +} + +/* + * free_tokens - free allocated list elements + * @tokens - the list of tokens + */ +int free_tokens(struct token *tokens) +{ + struct token *next_tokens; + + while (tokens) { + next_tokens = tokens->next; + free(tokens->name); + free(tokens); + tokens = next_tokens; + } + + return 0; +} + +/* + * scan_tokens - parse tokens for assembly use + * @tokens - an input source line of tokens + */ +int scan_tokens(struct token *tokens) +{ + struct token *token; + int token_index, class; + int classifier_number; + const struct classifier *classifier; + struct classifier_scores classifier_scores[sizeof(classifiers)/ + sizeof(struct classifier)]; + int winning_classifier; + int winning_score; + + for (classifier_number=0,classifier=classifiers; + classifier->fn; + classifier++,classifier_number++) { + + classifier_scores[classifier_number].mnemonic_index = -1; + classifier_scores[classifier_number].register_index = -1; + classifier_scores[classifier_number].score = 0; + + for (token_index=0,token=tokens; + token!=NULL; + token=token->next,token_index++) { + + // special case ignore '%' signs (e.g. registers) + if (strstr(token->name,"%") == token->name) + class = classifier->fn(&token->name[1]); + else + class = classifier->fn(token->name); + if (class == MNEMONIC) { + classifier_scores[classifier_number].score++; + classifier_scores[classifier_number].mnemonic_index = token_index; + } + if (class == REGISTER) { + classifier_scores[classifier_number].score++; + if (!classifier_scores[classifier_number].register_index) + classifier_scores[classifier_number].register_index = token_index; + } + + } + + if (classifier_scores[classifier_number].mnemonic_index < 0) + // No opcode was found - probably not a match + classifier_scores[classifier_number].score = 0; + if ((classifier_scores[classifier_number].register_index > 0) && + (classifier_scores[classifier_number].register_index < + classifier_scores[classifier_number].mnemonic_index)) + // Register came before opcode - probably not a match + classifier_scores[classifier_number].score = 0; + + } + + winning_classifier = -1; + winning_score = -1; + + for (classifier_number=0,classifier=classifiers; + classifier->fn; + classifier++,classifier_number++) { + + //printf("classifier %s score: %d\n", + // classifier->name, + // classifier_scores[classifier_number].score); + + if ((classifier_scores[classifier_number].score) && + (classifier_scores[classifier_number].score > + winning_score)) + { + winning_score = + classifier_scores[classifier_number].score; + winning_classifier = classifier_number; + } + } + + return winning_classifier; +} + +/* + * excluded_sourceline - filter out comment lines and assembler commands + * @source - a line of source (pre-tokenization) + * TODO: Implement this function + */ +int excluded_sourceline(char *source) +{ + //regex_t re_comment; + + return 0; +} + +/* + * scan_sourceline - parse one line of input (file) source + * source - textual string representation of an input line + */ +int scan_sourceline(char *source) +{ + struct token *tokens; + int winning_classifier = -1; + + if (excluded_sourceline(source)) + return -1; + + tokens = tokenize(source); + winning_classifier = scan_tokens(tokens); + free_tokens(tokens); + + return winning_classifier; +} + +/* + * usage - print a usage message + * @progname - name of the program as executed + */ +int usage(char *progname) { + + printf("Usage: %s [FILE]\n", progname); + printf("Scan the input file for known assembly languages\n"); + + return 0; +} + +/* Used to store state during file scanning */ +struct sourcefile { + char *file_name; + char *file_ext; + FILE *file; + int mode; + int inside_asm; +}; + +/* + * next_sourceline - get the next line to process + * @file - the open file object + * @mode - 0 returns every line, 1 skips asm sections + */ +char *next_sourceline(struct sourcefile *sourcefile, void *line, size_t len) { + + regex_t re_asm_singleton1; + regex_t re_asm_singleton2; + regex_t re_asm_block_open1; + regex_t re_asm_block_open2; + regex_t re_asm_block_close; + regmatch_t pmatch[4]; + int ret_re; + char match[255]; + + regcomp(&re_asm_singleton1, "^.*[_]*asm[_]*[ \t]*[_]*volatile[_]*[ \t]*\\([ \t]*\"(.*)\\);", + REG_EXTENDED|REG_ICASE); + regcomp(&re_asm_singleton2, "^.*[_]*asm[_]*[ \t]*\\([ \t]*\"(.*)\\);", + REG_EXTENDED|REG_ICASE); + + regcomp(&re_asm_block_open1, "^.*[_]*asm[_]*[ \t]*[_]*volatile[_]*[ \t]*\\([ \t]*[\"]*[ \t]*(.*)", + REG_EXTENDED|REG_ICASE); + regcomp(&re_asm_block_open2, "^.*[_]*asm[_]*[ \t]*\\([ \t]*[\"]*[ \t]*(.*)", + REG_EXTENDED|REG_ICASE); + regcomp(&re_asm_block_close, ".*\\);", + REG_EXTENDED|REG_ICASE); + + if (!sourcefile->mode) + line = fgets(line, len, sourcefile->file); + else { + do { + line = fgets(line, len, sourcefile->file); + if (!line) + break; // end of file + + // already in a multiblock? + if (sourcefile->inside_asm) { + ret_re = regexec(&re_asm_block_close, line, + (sizeof(pmatch)/sizeof(regmatch_t)), + pmatch, 0); + if (0 == ret_re) { + sourcefile->inside_asm = 0; + } else + break; // use current line + + } + + // try matching a singleton + ret_re = regexec(&re_asm_singleton1, line, + (sizeof(pmatch)/sizeof(regmatch_t)), + pmatch, 0); + + if (0 != ret_re) { + ret_re = regexec(&re_asm_singleton2, line, + (sizeof(pmatch)/sizeof(regmatch_t)), + pmatch, 0); + } + + if ((0 == ret_re) && (-1 != pmatch[1].rm_so)) { + strncpy(match, line+pmatch[1].rm_so, + pmatch[1].rm_eo-pmatch[1].rm_so); + match[pmatch[1].rm_eo-pmatch[1].rm_so] = '\0'; + strncpy(line, match, strlen(match)+1); + + break; // use current line + } + + // try multiblock option + ret_re = regexec(&re_asm_block_open1, line, + (sizeof(pmatch)/sizeof(regmatch_t)), + pmatch, 0); + if (0 != ret_re) + ret_re = regexec(&re_asm_block_open2, line, + (sizeof(pmatch)/sizeof(regmatch_t)), + pmatch, 0); + + if ((0 == ret_re) && (-1 != pmatch[1].rm_so)) { + strncpy(match, line+pmatch[1].rm_so, + pmatch[1].rm_eo-pmatch[1].rm_so); + match[pmatch[1].rm_eo-pmatch[1].rm_so] = '\0'; + strncpy(line, match, strlen(match)+1); + + sourcefile->inside_asm = 1; + + break; // use current line + } + + } while (line); + + //if (line && sourcefile->inside_asm) { + // printf("asm: %s\n", (char *)line); + //} + + } + + return line; +} + +/* + * scan_sourcefile - open a file and scan it + */ +int scan_sourcefile(char *source_file_name) { + + struct sourcefile sourcefile; + char input_line[255]; + char *line; + int classifier_totals[sizeof(classifiers)/sizeof(struct classifier)]; + int winning_classifier; + const struct classifier *classifier; + int classifier_number; + int overall_classifier; + int overall_total; + + for (classifier_number=0,classifier=classifiers; + classifier->fn; + classifier++,classifier_number++) { + classifier_totals[classifier_number] = 0; + } + + sourcefile.mode = 0; + sourcefile.inside_asm = 0; + sourcefile.file_name = source_file_name; + sourcefile.file_ext = strrchr(sourcefile.file_name, '.')+1; + + if ( 0 == strncasecmp(sourcefile.file_ext,"S", strlen("S"))) { + // assembly source file + sourcefile.mode = 0; + } + if (( 0 == strncasecmp(sourcefile.file_ext,"C", strlen("C"))) || + ( 0 == strncasecmp(sourcefile.file_ext,"H", strlen("H")))) { + // C source file + sourcefile.mode = 1; + } + + sourcefile.file = fopen(sourcefile.file_name, "r"); + + while (!feof(sourcefile.file)) { + line = next_sourceline(&sourcefile, input_line, + sizeof(input_line)); + if (!line) { + if (!feof(sourcefile.file)) { + printf("error reading file\n"); + exit(1); + } else + break; + } + + winning_classifier = scan_sourceline(input_line); + //if (winning_classifier >= 0) { + // JCM - DEBUG + //printf("classifier: %s\n", classifiers[winning_classifier].name); + //printf("line: %s\n", input_line); + //} + + if (winning_classifier >= 0) { + classifier_totals[winning_classifier]++; + } + + } + + overall_classifier = -1; + overall_total = -1; + for (classifier_number=0,classifier=classifiers; + classifier->fn; + classifier++,classifier_number++) { + if (classifier_totals[classifier_number] > overall_total) { + overall_classifier = classifier_number; + overall_total = classifier_totals[classifier_number]; + } + } + + if (overall_total > CLASSIFIER_THRESHOLD) + return overall_classifier; + else + return -1; // maybe not sure +} + +/* + * main - entry point + */ +int main(int argc, char **argv) +{ + + int winning_classifier; + char *source_file_name = argv[1]; + char *source_file_ext; + + if (argc != 2) { + usage(argv[0]); + exit(1); + } + + source_file_ext = strrchr(source_file_name, '.'); + + if (!source_file_ext) { + printf("error: must be run on C or assembly source files\n"); + usage(argv[0]); + exit(1); + } + + winning_classifier = scan_sourcefile(source_file_name); + + if (winning_classifier >= 0) { + printf("%s: %s\n", + source_file_name, classifiers[winning_classifier].name); + } else { + printf("%s: unknown\n", source_file_name); + } + + exit(0); +} |