diff options
Diffstat (limited to 'feed/sse_feed.py')
-rwxr-xr-x | feed/sse_feed.py | 121 |
1 files changed, 69 insertions, 52 deletions
diff --git a/feed/sse_feed.py b/feed/sse_feed.py index c6f0de9..d31d826 100755 --- a/feed/sse_feed.py +++ b/feed/sse_feed.py @@ -1,81 +1,98 @@ #!/usr/bin/python -import sys, os, MySQLdb, stat +import sys, os, stat, string from popen2 import Popen3 -supported = [".c", ".h"] +import sse_db +from sse_config import * +from sse_defs import * -def supported_source(fn): +def camel_split(word): - for e in supported: - if fn.endswith(e): - return True + if len(word) <= 0: + return [] - return False + r = [] + last = 0 -def last_insert_id(cursor): - cursor.execute("SELECT LAST_INSERT_ID()"); - return cursor.fetchone()[0] + for i in range(0, len(word)-1): + + if word[i].islower() and word[i+1].isupper(): + r.append(word[last:i+1]) + last = i+1 -def process_file(package_id, root, path): - global cursor - print "Processing %s" % path + r.append(word[last:]) - cursor.execute("INSERT INTO file (package_id, path, language_id) VALUES (%i, '%s', '0')" % (package_id, path)); + return r + - file_id = last_insert_id(cursor); +def default_subword_split(word): + r = [] + + w = word.split("_") - p = Popen3("lex-c %s" % (os.path.join(root, path))) + if len(w) > 1: + delimiter = "_" + else: + w = camel_split(word) + delimiter = "" - for identifier in p.fromchild: - text = identifier.strip() - - cursor.execute("INSERT IGNORE INTO word (text, type, file_id) VALUES ('%s', 'word', '%i')" % (text, file_id)) - cursor.execute("UPDATE word SET cnt=cnt+1 WHERE text='%s' AND type='word' AND file_id=%i" % (text, file_id)) + if len(w) > 1: - if p.wait() != 0: - print "WARNING: Subprocess failed!" + for i in range(1, len(w)): - del p + if len(w[i]) == 0: + continue + + n = string.join(w[i:], delimiter) -def handle_file(package_id, root, path, filename): + if len(n) >= 4: + r.append(n) - t = sys.lstat(os.path.join(path, filename)) + return r + +supported_languages = [ { + "extensions" : [".c", ".h", ".cc", ".hh", ".cpp", ".hpp"], + "subword_split" : default_subword_split, + "lexer" : SSE_DIR+"/sse_lex_c", + "language_id" : SSE_LANGUAGE_C + }] - if stat.F_ISREG(t.st_mode): +def find_language(fn): + + for l in supported_languages: + for e in l["extensions"]: + if fn.lower().endswith(e): + return l - extension = filename.split(".")[-1] + return None - if extension in ("c", "h"): - process_file(package_id, root, os.path.join(path, filename)) - return +def supported_source(fn): + return not find_language(fn) is None - os.unlink(os.path.join(root, path, filename)) +def process_source(archive, root, path, package_id, meta): + print "(%s) Processing %s" % (archive, path) -def handle_tree(path, name, url, md): - global cursor + language = find_language(path) - cursor.execute("INSERT INTO package (path, name, url, timestamp, md) VALUES ('%s', '%s', '%s', NOW(), '%s')" % (path + "/%s", name, url, md)); - package_id = last_insert_id(cursor); + assert not language is None - path = os.path.realpath(path) - - for dirpath, dirs, files in os.walk(path): - for f in files: - assert path + "/" == (dirpath + "/") [:len(path)+1] + file_id = sse_db.new_file(package_id, path, language["language_id"]) - handle_file(package_id, path, dirpath[len(path)+1:], f) + p = Popen3("%s %s" % (language["lexer"], os.path.join(root, path))) -if __name__ == "__main__": - db = MySQLdb.connect(host = "localhost", user = "sse_web", passwd = "ece6Yoli", db = "sse") - cursor = db.cursor(); - cursor.execute("SET AUTOCOMMIT=0") - cursor.execute("START TRANSACTION") + subword_split = language["subword_split"] - assert len(sys.argv) == 5 + for identifier in p.fromchild: + + text = identifier.strip() + sse_db.new_word(file_id, text, False) - handle_tree(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) + subwords = subword_split(text) + for w in subwords: + sse_db.new_word(file_id, w, True) + + if p.wait() != 0: + print "WARNING: Subprocess failed!" - cursor.execute("COMMIT") - cursor.close() - db.close() + del p |