diff options
author | Lennart Poettering <lennart@poettering.net> | 2005-11-22 23:45:32 +0000 |
---|---|---|
committer | Lennart Poettering <lennart@poettering.net> | 2005-11-22 23:45:32 +0000 |
commit | 0e1c2be3780aa6308185af98ed3d178f1667a9b9 (patch) | |
tree | e92d550c15fd0024f30f7006d68e06c16f147617 /feed | |
parent | 5c593d81a08ff84577c62016b2b2ef803b052614 (diff) |
add support for multiple crawlers
git-svn-id: file:///home/lennart/svn/public/sse/trunk@24 5fbabb74-0606-0410-a5e4-b5cc6a42724e
Diffstat (limited to 'feed')
Mode | File | Lines changed |
---|---|---|
-rw-r--r-- | feed/sse_config.py | 7 |
-rw-r--r-- | feed/sse_db.py | 22 |
-rwxr-xr-x | feed/sse_tar.py | 21 |
3 files changed, 28 insertions, 22 deletions
diff --git a/feed/sse_config.py b/feed/sse_config.py index 40d631f..2dfdbd7 100644 --- a/feed/sse_config.py +++ b/feed/sse_config.py @@ -2,4 +2,11 @@ import os HOME = os.environ["HOME"] + SSE_DIR = "/home/lennart/sse/feed" +SSE_CRAWLER_ID = 1 + +SSE_DB_HOST = "localhost" +SSE_DB_USER= "sse_web" +SSE_DB_PASSWORD = "ece6Yoli" +SSE_DB_DATABASE = "sse" diff --git a/feed/sse_db.py b/feed/sse_db.py index e99c5d0..6a0f7e6 100644 --- a/feed/sse_db.py +++ b/feed/sse_db.py @@ -1,7 +1,8 @@ import sys, os, MySQLdb, stat +from sse_config import * -db = MySQLdb.connect(host = "localhost", user = "sse_web", passwd = "ece6Yoli", db = "sse") +db = MySQLdb.connect(SSE_DB_HOST, SSE_DB_USER, SSE_DB_PASSWORD, SSE_DB_DATABASE) cursor = db.cursor(); def commit(): @@ -19,18 +20,18 @@ def last_insert_id(): def new_package(archive, root, meta): - cursor.execute('INSERT INTO package (path, timestamp, md) VALUES (%s, NOW(), %s)', (root + '/%s', meta["md"])) + cursor.execute('INSERT INTO package (crawler_id, path, timestamp, md) VALUES (%s, %s, NOW(), %s)', (SSE_CRAWLER_ID, root + '/%s', meta["md"])) - return last_insert_id(); + return (SSE_CRAWLER_ID, last_insert_id()) def find_package(md): - cursor.execute('SELECT id FROM package WHERE md=%s', md) + cursor.execute('SELECT crawler_id, id FROM package WHERE md=%s', md) if cursor.rowcount <= 0: return None - return int(cursor.fetchone()[0]) + return (int(cursor.fetchone()[0]), int(cursor.fetchone()[1])) def new_provider_record(recid, package_id, provider_id, meta): @@ -54,14 +55,13 @@ def new_provider_record(recid, package_id, provider_id, meta): except KeyError: l = "" - cursor.execute('REPLACE provider_record (id, package_id, provider_id, name, url, download_url, license) VALUES (%s, %s, %s, %s, %s, %s, %s)', (recid, package_id, provider_id, name, url, download_url, l)) - + cursor.execute('REPLACE provider_record (id, crawler_id, package_id, provider_id, name, url, download_url, license) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)', (recid, 
package_id[0], package_id[1], provider_id, name, url, download_url, l)) def new_file(package_id, path, language_id = 0): - cursor.execute('INSERT INTO file (package_id, path, language_id) VALUES (%s, %s, %s)', (package_id, path, language_id)); + cursor.execute('INSERT INTO file (crawler_id, package_id, path, language_id) VALUES (%s, %s, %s)', (package_id[0], package_id[1], path, language_id)); - return last_insert_id() + return (SSE_CRAWLER_ID, last_insert_id()) def new_word(file_id, text, is_subword): @@ -70,5 +70,5 @@ def new_word(file_id, text, is_subword): else: t = "word" - cursor.execute('INSERT IGNORE INTO word (text, type, file_id, cnt) VALUES (%s, %s, %s, 0)', (text, t, file_id)) - cursor.execute('UPDATE word SET cnt=cnt+1 WHERE text=%s AND type=%s AND file_id=%s', (text, t, file_id)) + cursor.execute('INSERT IGNORE INTO word (text, type, crawler_id, file_id, cnt) VALUES (%s, %s, %s, 0)', (text, t, crawler_id, file_id)) + cursor.execute('UPDATE word SET cnt=cnt+1 WHERE text=%s AND type=%s AND crawler_id=%s AND file_id=%s', (text, t, crawler_id, file_id)) diff --git a/feed/sse_tar.py b/feed/sse_tar.py index bf3cb8e..45700f9 100755 --- a/feed/sse_tar.py +++ b/feed/sse_tar.py @@ -115,12 +115,11 @@ def rm_rf(root): def process_archive(archive, meta = {}, recid = None, provider_id = SSE_PROVIDER_NONE): if recid is None: - recid = archive + recid = os.path.basename(archive) - md = calc_md(archive) - meta["md"] = md - - root = os.path.join(HOME, "sources", md) + if not meta.has_key("md"): + md = calc_md(archive) + meta["md"] = md sse_db.start_transaction() @@ -128,19 +127,19 @@ def process_archive(archive, meta = {}, recid = None, provider_id = SSE_PROVIDER package_id = sse_db.find_package(md) if not package_id is None: + print "Package '%s' already in database." 
% recid - if not recid is None: - # Update provider record - sse_db.new_provider_record(recid, package_id, provider_id, meta) + # Update provider record + sse_db.new_provider_record(recid, package_id, provider_id, meta) else: + root = os.path.join(HOME, "sources", md) + package_id = sse_db.new_package(archive, root, meta) print "Package '%s' is new in database." % recid - - if not recid is None: - sse_db.new_provider_record(recid, package_id, provider_id, meta) + sse_db.new_provider_record(recid, package_id, provider_id, meta) try: rm_rf(root) |