feed/sse_feed.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98

#!/usr/bin/python

import sys, os, stat, string
import subprocess
from popen2 import Popen3

import sse_db
from sse_config import *
from sse_defs import *

def camel_split(word):
    """Split ``word`` at every lowercase-to-uppercase transition.

    A new piece starts at each uppercase character that directly follows a
    lowercase one, e.g. "fooBarBaz" -> ["foo", "Bar", "Baz"].  Runs of
    uppercase characters are never broken up ("HTTPServer" stays whole).

    Returns a list of pieces; an empty ``word`` gives an empty list.
    """
    if not word:
        return []

    pieces = []
    start = 0

    # Walk over adjacent character pairs; ``pos`` is the index of ``left``.
    for pos, (left, right) in enumerate(zip(word, word[1:])):
        if left.islower() and right.isupper():
            boundary = pos + 1
            pieces.append(word[start:boundary])
            start = boundary

    # Whatever follows the last boundary (or the whole word) is the tail.
    pieces.append(word[start:])
    return pieces
            

def default_subword_split(word):
    """Return the trailing sub-identifiers of ``word``.

    ``word`` is split on underscores; if it contains none, it is split with
    ``camel_split`` instead.  For every non-empty part after the first, the
    parts from there to the end are re-joined (with "_" for underscore
    names, with nothing for camel-case names) and kept when the result has
    at least four characters.

    Examples:
        "foo_bar_bazz" -> ["bar_bazz", "bazz"]
        "getFileName"  -> ["FileName", "Name"]

    Returns a list of strings; it is empty when ``word`` has a single part
    or no suffix is long enough.
    """
    parts = word.split("_")

    if len(parts) > 1:
        delimiter = "_"
    else:
        parts = camel_split(word)
        delimiter = ""

    subwords = []

    # Start at 1: the suffix beginning at part 0 is the word itself.
    for start in range(1, len(parts)):

        # Consecutive, leading or trailing underscores yield empty parts.
        if not parts[start]:
            continue

        # str.join replaces the deprecated string.join(); same result.
        suffix = delimiter.join(parts[start:])

        if len(suffix) >= 4:
            subwords.append(suffix)

    return subwords
                    
# Table of the languages this module can index.  Every entry provides:
#   "extensions"    -- file name suffixes; find_language() tests the
#                      lowercased file name against them
#   "subword_split" -- callable that derives extra words from an identifier
#   "lexer"         -- executable that process_source() runs on a file and
#                      whose output it reads line by line
#   "language_id"   -- value handed to sse_db.new_file() for the file
supported_languages = [
    {
        "extensions": [".c", ".h", ".cc", ".hh", ".cpp", ".hpp"],
        "subword_split": default_subword_split,
        "lexer": SSE_DIR + "/sse_lex_c",
        "language_id": SSE_LANGUAGE_C,
    },
]

def find_language(fn):
    """Return the ``supported_languages`` entry that matches file name ``fn``.

    The lowercased file name is tested against the suffixes in each entry's
    "extensions" list; the first entry with a matching suffix is returned.
    Returns None when no entry matches.
    """
    # Lowercase once rather than once per extension.
    lowered = fn.lower()

    for language in supported_languages:
        # str.endswith accepts a tuple and checks all suffixes in one call.
        if lowered.endswith(tuple(language["extensions"])):
            return language

    return None

def supported_source(fn):
    """Return True when ``find_language`` knows a language for ``fn``."""
    language = find_language(fn)
    return language is not None

def process_source(archive, root, path, package_id, meta):
    """Index the identifiers of one source file into the database.

    Registers the file with ``sse_db.new_file``, runs the language's lexer
    executable on ``os.path.join(root, path)`` and reads its output line by
    line.  Each stripped line is stored with ``sse_db.new_word(..., False)``,
    followed by every subword the language's "subword_split" callable
    derives from it, stored with ``sse_db.new_word(..., True)``.

    archive    -- label shown in the progress message
    root       -- directory that ``path`` is relative to
    path       -- file path; selects the language and is passed to
                  ``sse_db.new_file``
    package_id -- passed through to ``sse_db.new_file``
    meta       -- not used by this function

    ``path`` must be one that ``find_language`` recognises.  A warning is
    printed when the lexer cannot be started or exits with a non-zero status.
    """
    print("(%s) Processing %s" % (archive, path))

    language = find_language(path)

    assert language is not None

    file_id = sse_db.new_file(package_id, path, language["language_id"])

    subword_split = language["subword_split"]

    # Pass the arguments as a list so that no shell is involved: a file name
    # containing spaces or shell metacharacters can neither break the
    # command line nor inject commands.
    try:
        proc = subprocess.Popen(
            [language["lexer"], os.path.join(root, path)],
            stdout=subprocess.PIPE,
            close_fds=True)
    except OSError:
        # Without a shell, a missing or non-executable lexer raises here
        # instead of producing a non-zero exit status; report it the same way.
        print("WARNING: Subprocess failed!")
        return

    try:
        for identifier in proc.stdout:

            text = identifier.strip()
            sse_db.new_word(file_id, text, False)

            for subword in subword_split(text):
                sse_db.new_word(file_id, subword, True)
    finally:
        # Close the pipe and reap the child even if storing a word failed.
        proc.stdout.close()
        status = proc.wait()

    if status != 0:
        print("WARNING: Subprocess failed!")