#!/usr/bin/python
"""Index the words of a source file in the SSE database.

For a supported source file, the language's external lexer is run on the
file.  Every line the lexer prints is stored as a word through
``sse_db.new_word``, followed by the sub-words derived from it by the
language's ``subword_split`` function.
"""

import os
import stat
import string
import subprocess
import sys

try:
    # Legacy Python 2 module.  It is no longer used by this file but the name
    # is kept importable for backward compatibility.
    from popen2 import Popen3
except ImportError:
    # popen2 was removed in Python 3.
    Popen3 = None

import sse_db
# The star imports supply SSE_DIR and SSE_LANGUAGE_C used below.
from sse_config import *
from sse_defs import *


def camel_split(word):
    """Split *word* at every lower-case to upper-case transition.

    Returns the pieces in order, e.g. ``"fooBarBaz"`` gives
    ``["foo", "Bar", "Baz"]``.  A word without such a transition is returned
    as a one-element list; an empty word gives ``[]``.
    """
    if not word:
        return []

    pieces = []
    start = 0
    for i in range(len(word) - 1):
        if word[i].islower() and word[i + 1].isupper():
            pieces.append(word[start:i + 1])
            start = i + 1
    # Whatever follows the last transition (or the whole word).
    pieces.append(word[start:])
    return pieces


def default_subword_split(word):
    """Return the trailing sub-words of *word* that are worth indexing.

    The word is split on ``"_"``; if that yields a single part it is split
    on camelCase transitions instead.  For every non-empty part except the
    first, the suffix starting at that part (re-joined with the delimiter
    that was used for splitting) is returned if it is at least four
    characters long.  E.g. ``"foo_bar_baz"`` gives ``["bar_baz"]`` and
    ``"fooBarBaz"`` gives ``["BarBaz"]``.
    """
    parts = word.split("_")
    if len(parts) > 1:
        delimiter = "_"
    else:
        parts = camel_split(word)
        delimiter = ""

    subwords = []
    for i in range(1, len(parts)):
        # Empty parts come from leading, trailing or doubled underscores.
        if not parts[i]:
            continue
        suffix = delimiter.join(parts[i:])
        if len(suffix) >= 4:
            subwords.append(suffix)
    return subwords


# One entry per supported language:
#   "extensions"    - lower-case file name suffixes handled by the entry
#   "subword_split" - callable mapping a word to a list of sub-words
#   "lexer"         - path of the external lexer executable
#   "language_id"   - id passed to sse_db.new_file()
supported_languages = [
    {
        "extensions": [".c", ".h", ".cc", ".hh", ".cpp", ".hpp"],
        "subword_split": default_subword_split,
        "lexer": SSE_DIR+"/sse_lex_c",
        "language_id": SSE_LANGUAGE_C,
    },
]


def find_language(fn):
    """Return the ``supported_languages`` entry matching file name *fn*.

    The match is a case-insensitive suffix comparison against each entry's
    extensions.  Returns ``None`` if no entry matches.
    """
    lowered = fn.lower()
    for language in supported_languages:
        if lowered.endswith(tuple(language["extensions"])):
            return language
    return None


def supported_source(fn):
    """Return True if *fn* has an extension listed in ``supported_languages``."""
    return find_language(fn) is not None


def process_source(archive, root, path, package_id, meta):
    """Run the lexer on ``root/path`` and store its words in the database.

    archive    -- archive name, used only in the progress message
    root       -- directory that *path* is relative to
    path       -- file path relative to *root*; it selects the language and
                  is the name recorded by ``sse_db.new_file``
    package_id -- package id passed to ``sse_db.new_file``
    meta       -- accepted for interface compatibility; not used here

    *path* must be a supported source file (see ``supported_source``).
    Prints a warning if the lexer cannot be started or exits with a
    non-zero status.
    """
    print("(%s) Processing %s" % (archive, path))

    language = find_language(path)
    assert language is not None

    file_id = sse_db.new_file(package_id, path, language["language_id"])
    subword_split = language["subword_split"]

    # Pass the arguments as a list so that no shell is involved: file names
    # containing spaces or shell metacharacters reach the lexer unchanged.
    try:
        lexer = subprocess.Popen(
            [language["lexer"], os.path.join(root, path)],
            stdout=subprocess.PIPE,
            universal_newlines=True)  # text lines on Python 2 and 3 alike
    except OSError:
        # A missing or non-executable lexer used to surface as a failing
        # shell command; keep reporting it as a warning instead of crashing.
        print("WARNING: Subprocess failed!")
        return

    try:
        for line in lexer.stdout:
            text = line.strip()
            sse_db.new_word(file_id, text, False)
            for subword in subword_split(text):
                sse_db.new_word(file_id, subword, True)
    finally:
        # Always release the pipe and reap the child, even if a database
        # call raised.
        lexer.stdout.close()
        status = lexer.wait()

    if status != 0:
        print("WARNING: Subprocess failed!")