diff options
author | Lennart Poettering <lennart@poettering.net> | 2005-11-23 03:10:22 +0000 |
---|---|---|
committer | Lennart Poettering <lennart@poettering.net> | 2005-11-23 03:10:22 +0000 |
commit | 64f14c6f2760f20417df31f543d03a08c59fe988 (patch) | |
tree | 8bc662d9f8b69a973b788252c0bd0b8de8209c03 /feed | |
parent | 3574f03545c00b33620d6c35cf7115310b66d264 (diff) |
* add more licenses
* fix "unsubscriptable object" issue
* DB optimization
git-svn-id: file:///home/lennart/svn/public/sse/trunk@36 5fbabb74-0606-0410-a5e4-b5cc6a42724e
Diffstat (limited to 'feed')
-rw-r--r-- | feed/sse_db.py | 16 |
-rwxr-xr-x | feed/sse_feed.py | 19 |
-rwxr-xr-x | feed/sse_fm.py | 4 |
3 files changed, 26 insertions, 13 deletions
diff --git a/feed/sse_db.py b/feed/sse_db.py index 16b07c7..bd74f15 100644 --- a/feed/sse_db.py +++ b/feed/sse_db.py @@ -30,8 +30,9 @@ def find_package(md): if cursor.rowcount <= 0: return None - - return (int(cursor.fetchone()[0]), int(cursor.fetchone()[1])) + + r = cursor.fetchone() + return (int(r[0]), int(r[1])) def new_provider_record(recid, package_id, provider_id, meta): @@ -63,12 +64,13 @@ def new_file(package_id, path, language_id = 0): return (SSE_CRAWLER_ID, last_insert_id()) -def new_word(file_id, text, is_subword): +def new_word(file_id, text, is_subword, n): if is_subword: - t = "subword" + wtype = "subword" else: - t = "word" + wtype = "word" + + assert n > 0 - cursor.execute('INSERT IGNORE INTO word (text, type, crawler_id, file_id, cnt) VALUES (%s, %s, %s, %s, 0)', (text, t, file_id[0], file_id[1])) - cursor.execute('UPDATE word SET cnt=cnt+1 WHERE text=%s AND type=%s AND crawler_id=%s AND file_id=%s', (text, t, file_id[0], file_id[1])) + cursor.execute('INSERT INTO word (text, type, crawler_id, file_id, cnt) VALUES (%s, %s, %s, %s, %s)', (text, wtype, file_id[0], file_id[1], n)) diff --git a/feed/sse_feed.py b/feed/sse_feed.py index 7925c25..6a34d69 100755 --- a/feed/sse_feed.py +++ b/feed/sse_feed.py @@ -33,16 +33,25 @@ def process_source(archive, root, path, package_id, meta): file_id = sse_db.new_file(package_id, path, language["language_id"]) + table = {} + p = Popen3("%s %s" % (language["lexer"], os.path.join(root, path))) for identifier in p.fromchild: - - text = identifier.strip() + t = identifier.strip() + + try: + table[t.lower()][1] += 1 + except KeyError: + table[t.lower()] = [t, 1] - if text.startswith("S:"): - sse_db.new_word(file_id, text[2:], True) + for k, v in table.items(): + if v[0].startswith("S:"): + sse_db.new_word(file_id, v[0][2:], True, v[1]) else: - sse_db.new_word(file_id, text, False) + sse_db.new_word(file_id, v[0], False, v[1]) + + del table if p.wait() != 0: print "WARNING: Subprocess failed!" 
diff --git a/feed/sse_fm.py b/feed/sse_fm.py
index 2d79c8c..467ed9f 100755
--- a/feed/sse_fm.py
+++ b/feed/sse_fm.py
@@ -32,7 +32,9 @@ license_whitelist = [
 "BSD License (revised)",
 "DFSG approved",
 "GNU Free Documentation License (FDL)",
- "W3C License"
+ "W3C License",
+ "Mozilla Public License (MPL)",
+ "The Apache License 2.0"
 ]
 
 def process_record(meta):