/* * whichasm - which assembly language does this file use? * * This is a not particularly intelligent tool for attempting to detect * the assembly language used within a particular source file, using a * simple heuristic that the most popular language wins. We don't (yet) * handle multiple assembly languages in a given file, but we could. * * Copyright (C) 2012 Jon Masters * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 (only) of the GNU General * Public License as published by the Free Software Foundation. */ #include #include #include #include #include #include #include "classifier.h" #define CLASSIFIER_THRESHOLD 1 typedef int (*classifier_fn)(char *); struct classifier { classifier_fn fn; const char *name; }; static const struct classifier classifiers[] = { { classifier_arm, "arm" }, { classifier_ppc, "ppc" }, { classifier_s390x, "s390x" }, { classifier_x86, "x86" }, { NULL, NULL } }; struct classifier_scores { int mnemonic_index; int register_index; int score; }; /* * tokenize - split the input line into a list of simple tokens * @source - a line of "source" code from the input file */ struct token *tokenize(char *source) { int this_char, source_len, word_start, in_word, in_space; struct token *tokens, *token, *last_token; tokens = NULL; token = NULL; last_token = NULL; word_start = in_word = 0; in_space = 1; // start off thinking we're at a separator source_len = strlen(source); for (this_char = 0; this_char < source_len; this_char++) { // state transition into a new word if ((isalnum(source[this_char])) || ('%' == source[this_char])) { if (in_space) { if (!in_word) { word_start = this_char; } in_word = 1; in_space = 0; } } else { // state transition out of a word if ((isspace(source[this_char])) || (',' == source[this_char])) in_space = 1; else in_space = 0; if (in_word && (this_char > word_start)) { token = malloc(sizeof(token)); if (!token) { printf("error allocating memory\n"); exit(1); } token->name = malloc(this_char-word_start+1); if (!token->name) { printf("error allocating memory\n"); exit(1); } token->next = NULL; strncpy(token->name, &source[word_start], this_char-word_start); token->name[this_char-word_start]='\0'; if (tokens) last_token->next = token; else tokens = token; last_token = token; } in_word = 0; } } // handle last or sole word specially if ((in_word) && (this_char > word_start)) { token = malloc(sizeof(token)); if (!token) { printf("error allocating memory\n"); exit(1); } token->name = malloc(this_char-word_start+1); if (!token->name) { printf("error allocating memory\n"); exit(1); } token->next = NULL; strncpy(token->name, &source[word_start], this_char-word_start); token->name[this_char-word_start] = '\0'; if (tokens) last_token->next = token; else tokens = token; last_token = token; } return tokens; } /* * free_tokens - free allocated list elements * @tokens - the list of tokens */ int free_tokens(struct token *tokens) { struct token *next_tokens; while (tokens) { next_tokens = tokens->next; free(tokens->name); free(tokens); tokens = next_tokens; } return 0; } /* * scan_tokens - parse tokens for assembly use * @tokens - an input source line of tokens */ int scan_tokens(struct token *tokens) { struct token *token; int token_index, class; int classifier_number; const struct classifier *classifier; struct classifier_scores classifier_scores[sizeof(classifiers)/ sizeof(struct classifier)]; int winning_classifier; int winning_score; for (classifier_number=0,classifier=classifiers; classifier->fn; classifier++,classifier_number++) { classifier_scores[classifier_number].mnemonic_index = -1; classifier_scores[classifier_number].register_index = -1; classifier_scores[classifier_number].score = 0; for (token_index=0,token=tokens; token!=NULL; token=token->next,token_index++) { // special case ignore '%' signs (e.g. registers) if (strstr(token->name,"%") == token->name) class = classifier->fn(&token->name[1]); else class = classifier->fn(token->name); if (class == MNEMONIC) { classifier_scores[classifier_number].score++; classifier_scores[classifier_number].mnemonic_index = token_index; } if (class == REGISTER) { classifier_scores[classifier_number].score++; if (!classifier_scores[classifier_number].register_index) classifier_scores[classifier_number].register_index = token_index; } } if (classifier_scores[classifier_number].mnemonic_index < 0) // No opcode was found - probably not a match classifier_scores[classifier_number].score = 0; if ((classifier_scores[classifier_number].register_index > 0) && (classifier_scores[classifier_number].register_index < classifier_scores[classifier_number].mnemonic_index)) // Register came before opcode - probably not a match classifier_scores[classifier_number].score = 0; } winning_classifier = -1; winning_score = -1; for (classifier_number=0,classifier=classifiers; classifier->fn; classifier++,classifier_number++) { //printf("classifier %s score: %d\n", // classifier->name, // classifier_scores[classifier_number].score); if ((classifier_scores[classifier_number].score) && (classifier_scores[classifier_number].score > winning_score)) { winning_score = classifier_scores[classifier_number].score; winning_classifier = classifier_number; } } return winning_classifier; } /* * excluded_sourceline - filter out comment lines and assembler commands * @source - a line of source (pre-tokenization) * TODO: Implement this function */ int excluded_sourceline(char *source) { //regex_t re_comment; return 0; } /* * scan_sourceline - parse one line of input (file) source * source - textual string representation of an input line */ int scan_sourceline(char *source) { struct token *tokens; int winning_classifier = -1; if (excluded_sourceline(source)) return -1; tokens = tokenize(source); winning_classifier = scan_tokens(tokens); free_tokens(tokens); return winning_classifier; } /* * usage - print a usage message * @progname - name of the program as executed */ int usage(char *progname) { printf("Usage: %s [FILE]\n", progname); printf("Scan the input file for known assembly languages\n"); return 0; } /* Used to store state during file scanning */ struct sourcefile { char *file_name; char *file_ext; FILE *file; int mode; int inside_asm; }; /* * next_sourceline - get the next line to process * @file - the open file object * @mode - 0 returns every line, 1 skips asm sections */ char *next_sourceline(struct sourcefile *sourcefile, void *line, size_t len) { regex_t re_asm_singleton1; regex_t re_asm_singleton2; regex_t re_asm_block_open1; regex_t re_asm_block_open2; regex_t re_asm_block_close; regmatch_t pmatch[4]; int ret_re; char match[255]; regcomp(&re_asm_singleton1, "^.*[_]*asm[_]*[ \t]*[_]*volatile[_]*[ \t]*\\([ \t]*\"(.*)\\);", REG_EXTENDED|REG_ICASE); regcomp(&re_asm_singleton2, "^.*[_]*asm[_]*[ \t]*\\([ \t]*\"(.*)\\);", REG_EXTENDED|REG_ICASE); regcomp(&re_asm_block_open1, "^.*[_]*asm[_]*[ \t]*[_]*volatile[_]*[ \t]*\\([ \t]*[\"]*[ \t]*(.*)", REG_EXTENDED|REG_ICASE); regcomp(&re_asm_block_open2, "^.*[_]*asm[_]*[ \t]*\\([ \t]*[\"]*[ \t]*(.*)", REG_EXTENDED|REG_ICASE); regcomp(&re_asm_block_close, ".*\\);", REG_EXTENDED|REG_ICASE); if (!sourcefile->mode) line = fgets(line, len, sourcefile->file); else { do { line = fgets(line, len, sourcefile->file); if (!line) break; // end of file // already in a multiblock? if (sourcefile->inside_asm) { ret_re = regexec(&re_asm_block_close, line, (sizeof(pmatch)/sizeof(regmatch_t)), pmatch, 0); if (0 == ret_re) { sourcefile->inside_asm = 0; } else break; // use current line } // try matching a singleton ret_re = regexec(&re_asm_singleton1, line, (sizeof(pmatch)/sizeof(regmatch_t)), pmatch, 0); if (0 != ret_re) { ret_re = regexec(&re_asm_singleton2, line, (sizeof(pmatch)/sizeof(regmatch_t)), pmatch, 0); } if ((0 == ret_re) && (-1 != pmatch[1].rm_so)) { strncpy(match, line+pmatch[1].rm_so, pmatch[1].rm_eo-pmatch[1].rm_so); match[pmatch[1].rm_eo-pmatch[1].rm_so] = '\0'; strncpy(line, match, strlen(match)+1); break; // use current line } // try multiblock option ret_re = regexec(&re_asm_block_open1, line, (sizeof(pmatch)/sizeof(regmatch_t)), pmatch, 0); if (0 != ret_re) ret_re = regexec(&re_asm_block_open2, line, (sizeof(pmatch)/sizeof(regmatch_t)), pmatch, 0); if ((0 == ret_re) && (-1 != pmatch[1].rm_so)) { strncpy(match, line+pmatch[1].rm_so, pmatch[1].rm_eo-pmatch[1].rm_so); match[pmatch[1].rm_eo-pmatch[1].rm_so] = '\0'; strncpy(line, match, strlen(match)+1); sourcefile->inside_asm = 1; break; // use current line } } while (line); //if (line && sourcefile->inside_asm) { // printf("asm: %s\n", (char *)line); //} } return line; } /* * scan_sourcefile - open a file and scan it */ int scan_sourcefile(char *source_file_name) { struct sourcefile sourcefile; char input_line[255]; char *line; int classifier_totals[sizeof(classifiers)/sizeof(struct classifier)]; int winning_classifier; const struct classifier *classifier; int classifier_number; int overall_classifier; int overall_total; for (classifier_number=0,classifier=classifiers; classifier->fn; classifier++,classifier_number++) { classifier_totals[classifier_number] = 0; } sourcefile.mode = 0; sourcefile.inside_asm = 0; sourcefile.file_name = source_file_name; sourcefile.file_ext = strrchr(sourcefile.file_name, '.')+1; if ( 0 == strncasecmp(sourcefile.file_ext,"S", strlen("S"))) { // assembly source file sourcefile.mode = 0; } if (( 0 == strncasecmp(sourcefile.file_ext,"C", strlen("C"))) || ( 0 == strncasecmp(sourcefile.file_ext,"H", strlen("H")))) { // C source file sourcefile.mode = 1; } sourcefile.file = fopen(sourcefile.file_name, "r"); while (!feof(sourcefile.file)) { line = next_sourceline(&sourcefile, input_line, sizeof(input_line)); if (!line) { if (!feof(sourcefile.file)) { printf("error reading file\n"); exit(1); } else break; } winning_classifier = scan_sourceline(input_line); //if (winning_classifier >= 0) { // JCM - DEBUG //printf("classifier: %s\n", classifiers[winning_classifier].name); //printf("line: %s\n", input_line); //} if (winning_classifier >= 0) { classifier_totals[winning_classifier]++; } } overall_classifier = -1; overall_total = -1; for (classifier_number=0,classifier=classifiers; classifier->fn; classifier++,classifier_number++) { if (classifier_totals[classifier_number] > overall_total) { overall_classifier = classifier_number; overall_total = classifier_totals[classifier_number]; } } if (overall_total > CLASSIFIER_THRESHOLD) return overall_classifier; else return -1; // maybe not sure } /* * main - entry point */ int main(int argc, char **argv) { int winning_classifier; char *source_file_name = argv[1]; char *source_file_ext; if (argc != 2) { usage(argv[0]); exit(1); } source_file_ext = strrchr(source_file_name, '.'); if (!source_file_ext) { printf("error: must be run on C or assembly source files\n"); usage(argv[0]); exit(1); } winning_classifier = scan_sourcefile(source_file_name); if (winning_classifier >= 0) { printf("%s: %s\n", source_file_name, classifiers[winning_classifier].name); } else { printf("%s: unknown\n", source_file_name); } exit(0); }