From 0e1c2be3780aa6308185af98ed3d178f1667a9b9 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 22 Nov 2005 23:45:32 +0000 Subject: add support for multiple crawlers git-svn-id: file:///home/lennart/svn/public/sse/trunk@24 5fbabb74-0606-0410-a5e4-b5cc6a42724e --- feed/sse_config.py | 7 +++++++ feed/sse_db.py | 22 +++++++++++----------- feed/sse_tar.py | 21 ++++++++++----------- sse.sql | 33 ++++++++++++++++++++++++++++----- 4 files changed, 56 insertions(+), 27 deletions(-) diff --git a/feed/sse_config.py b/feed/sse_config.py index 40d631f..2dfdbd7 100644 --- a/feed/sse_config.py +++ b/feed/sse_config.py @@ -2,4 +2,11 @@ import os HOME = os.environ["HOME"] + SSE_DIR = "/home/lennart/sse/feed" +SSE_CRAWLER_ID = 1 + +SSE_DB_HOST = "localhost" +SSE_DB_USER = "sse_web" +SSE_DB_PASSWORD = "ece6Yoli" +SSE_DB_DATABASE = "sse" diff --git a/feed/sse_db.py b/feed/sse_db.py index e99c5d0..6a0f7e6 100644 --- a/feed/sse_db.py +++ b/feed/sse_db.py @@ -1,7 +1,8 @@ import sys, os, MySQLdb, stat +from sse_config import * -db = MySQLdb.connect(host = "localhost", user = "sse_web", passwd = "ece6Yoli", db = "sse") +db = MySQLdb.connect(SSE_DB_HOST, SSE_DB_USER, SSE_DB_PASSWORD, SSE_DB_DATABASE) cursor = db.cursor(); def commit(): @@ -19,18 +20,18 @@ def last_insert_id(): def new_package(archive, root, meta): - cursor.execute('INSERT INTO package (path, timestamp, md) VALUES (%s, NOW(), %s)', (root + '/%s', meta["md"])) + cursor.execute('INSERT INTO package (crawler_id, path, timestamp, md) VALUES (%s, %s, NOW(), %s)', (SSE_CRAWLER_ID, root + '/%s', meta["md"])) - return last_insert_id(); + return (SSE_CRAWLER_ID, last_insert_id()) def find_package(md): - cursor.execute('SELECT id FROM package WHERE md=%s', md) + cursor.execute('SELECT crawler_id, id FROM package WHERE md=%s', md) if cursor.rowcount <= 0: return None - return int(cursor.fetchone()[0]) + row = cursor.fetchone(); return (int(row[0]), int(row[1])) def new_provider_record(recid, package_id, provider_id, 
meta): @@ -54,14 +55,13 @@ def new_provider_record(recid, package_id, provider_id, meta): except KeyError: l = "" - cursor.execute('REPLACE provider_record (id, package_id, provider_id, name, url, download_url, license) VALUES (%s, %s, %s, %s, %s, %s, %s)', (recid, package_id, provider_id, name, url, download_url, l)) - + cursor.execute('REPLACE provider_record (id, crawler_id, package_id, provider_id, name, url, download_url, license) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)', (recid, package_id[0], package_id[1], provider_id, name, url, download_url, l)) def new_file(package_id, path, language_id = 0): - cursor.execute('INSERT INTO file (package_id, path, language_id) VALUES (%s, %s, %s)', (package_id, path, language_id)); + cursor.execute('INSERT INTO file (crawler_id, package_id, path, language_id) VALUES (%s, %s, %s, %s)', (package_id[0], package_id[1], path, language_id)); - return last_insert_id() + return (SSE_CRAWLER_ID, last_insert_id()) def new_word(file_id, text, is_subword): @@ -70,5 +70,5 @@ def new_word(file_id, text, is_subword): else: t = "word" - cursor.execute('INSERT IGNORE INTO word (text, type, file_id, cnt) VALUES (%s, %s, %s, 0)', (text, t, file_id)) - cursor.execute('UPDATE word SET cnt=cnt+1 WHERE text=%s AND type=%s AND file_id=%s', (text, t, file_id)) + cursor.execute('INSERT IGNORE INTO word (text, type, crawler_id, file_id, cnt) VALUES (%s, %s, %s, %s, 0)', (text, t, SSE_CRAWLER_ID, file_id)) + cursor.execute('UPDATE word SET cnt=cnt+1 WHERE text=%s AND type=%s AND crawler_id=%s AND file_id=%s', (text, t, SSE_CRAWLER_ID, file_id)) diff --git a/feed/sse_tar.py b/feed/sse_tar.py index bf3cb8e..45700f9 100755 --- a/feed/sse_tar.py +++ b/feed/sse_tar.py @@ -115,12 +115,11 @@ def rm_rf(root): def process_archive(archive, meta = {}, recid = None, provider_id = SSE_PROVIDER_NONE): if recid is None: - recid = archive + recid = os.path.basename(archive) - md = calc_md(archive) - meta["md"] = md - - root = os.path.join(HOME, "sources", md) + if not 
meta.has_key("md"): + meta["md"] = calc_md(archive) + md = meta["md"] sse_db.start_transaction() @@ -128,19 +127,19 @@ def process_archive(archive, meta = {}, recid = None, provider_id = SSE_PROVIDER + root = os.path.join(HOME, "sources", md) + package_id = sse_db.find_package(md) if not package_id is None: + print "Package '%s' already in database." % recid - if not recid is None: - # Update provider record - sse_db.new_provider_record(recid, package_id, provider_id, meta) + # Update provider record + sse_db.new_provider_record(recid, package_id, provider_id, meta) else: package_id = sse_db.new_package(archive, root, meta) print "Package '%s' is new in database." % recid - - if not recid is None: - sse_db.new_provider_record(recid, package_id, provider_id, meta) + sse_db.new_provider_record(recid, package_id, provider_id, meta) try: rm_rf(root) diff --git a/sse.sql b/sse.sql index b7b1622..2345053 100644 --- a/sse.sql +++ b/sse.sql @@ -4,41 +4,64 @@ DROP TABLE word; DROP TABLE file; DROP TABLE package; DROP TABLE provider_record; +DROP TABLE crawler; CREATE TABLE word ( text VARCHAR(40) NOT NULL, type ENUM ('word', 'subword') DEFAULT 'word' NOT NULL, + crawler_id TINYINT UNSIGNED NOT NULL, file_id INTEGER UNSIGNED NOT NULL, + cnt INTEGER UNSIGNED DEFAULT 0 NOT NULL, - PRIMARY KEY (text, type, file_id) + + PRIMARY KEY (text, type, crawler_id, file_id) ) ENGINE=InnoDB; CREATE TABLE file ( + crawler_id TINYINT UNSIGNED NOT NULL, id INTEGER UNSIGNED NOT NULL AUTO_INCREMENT, + package_id INTEGER UNSIGNED NOT NULL, path VARBINARY(255) NOT NULL, language_id TINYINT UNSIGNED NOT NULL, - PRIMARY KEY (id), - UNIQUE KEY (package_id, path) + + PRIMARY KEY (crawler_id, id), + UNIQUE KEY (crawler_id, package_id, path) ) ENGINE=InnoDB; CREATE TABLE package ( + crawler_id TINYINT UNSIGNED NOT NULL, id INTEGER UNSIGNED NOT NULL AUTO_INCREMENT, + path VARBINARY(255) NOT NULL, timestamp TIMESTAMP NOT NULL, md CHAR(32) NOT NULL DEFAULT '', - PRIMARY KEY(id), + + PRIMARY KEY(crawler_id, id), 
UNIQUE KEY (md) ) ENGINE=InnoDB; CREATE TABLE provider_record ( id VARBINARY(64) NOT NULL, + + crawler_id TINYINT UNSIGNED NOT NULL, package_id INTEGER UNSIGNED NOT NULL, + provider_id TINYINT UNSIGNED NOT NULL, name VARBINARY(255) NOT NULL DEFAULT 'noname', url VARBINARY(255) NOT NULL DEFAULT '', download_url VARBINARY(255) NOT NULL DEFAULT '', license VARCHAR(64) NOT NULL DEFAULT '', + PRIMARY KEY(id), - UNIQUE KEY (package_id, provider_id) + UNIQUE KEY (crawler_id, package_id, provider_id) +) ENGINE=InnoDB; + +CREATE TABLE crawler ( + id TINYINT UNSIGNED NOT NULL, + + name VARCHAR(255) NOT NULL, + url VARBINARY(255) NOT NULL, + + PRIMARY KEY(id) ) ENGINE=InnoDB; -- cgit