summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLennart Poettering <lennart@poettering.net>2005-11-22 23:45:32 +0000
committerLennart Poettering <lennart@poettering.net>2005-11-22 23:45:32 +0000
commit0e1c2be3780aa6308185af98ed3d178f1667a9b9 (patch)
treee92d550c15fd0024f30f7006d68e06c16f147617
parent5c593d81a08ff84577c62016b2b2ef803b052614 (diff)
add support for multiple crawlers
git-svn-id: file:///home/lennart/svn/public/sse/trunk@24 5fbabb74-0606-0410-a5e4-b5cc6a42724e
-rw-r--r--feed/sse_config.py7
-rw-r--r--feed/sse_db.py22
-rwxr-xr-xfeed/sse_tar.py21
-rw-r--r--sse.sql33
4 files changed, 56 insertions, 27 deletions
diff --git a/feed/sse_config.py b/feed/sse_config.py
index 40d631f..2dfdbd7 100644
--- a/feed/sse_config.py
+++ b/feed/sse_config.py
@@ -2,4 +2,11 @@
import os
HOME = os.environ["HOME"]
+
SSE_DIR = "/home/lennart/sse/feed"
+SSE_CRAWLER_ID = 1
+
+SSE_DB_HOST = "localhost"
+SSE_DB_USER= "sse_web"
+SSE_DB_PASSWORD = "ece6Yoli"
+SSE_DB_DATABASE = "sse"
diff --git a/feed/sse_db.py b/feed/sse_db.py
index e99c5d0..6a0f7e6 100644
--- a/feed/sse_db.py
+++ b/feed/sse_db.py
@@ -1,7 +1,8 @@
import sys, os, MySQLdb, stat
+from sse_config import *
-db = MySQLdb.connect(host = "localhost", user = "sse_web", passwd = "ece6Yoli", db = "sse")
+db = MySQLdb.connect(SSE_DB_HOST, SSE_DB_USER, SSE_DB_PASSWORD, SSE_DB_DATABASE)
cursor = db.cursor();
def commit():
@@ -19,18 +20,18 @@ def last_insert_id():
def new_package(archive, root, meta):
- cursor.execute('INSERT INTO package (path, timestamp, md) VALUES (%s, NOW(), %s)', (root + '/%s', meta["md"]))
+ cursor.execute('INSERT INTO package (crawler_id, path, timestamp, md) VALUES (%s, %s, NOW(), %s)', (SSE_CRAWLER_ID, root + '/%s', meta["md"]))
- return last_insert_id();
+ return (SSE_CRAWLER_ID, last_insert_id())
def find_package(md):
- cursor.execute('SELECT id FROM package WHERE md=%s', md)
+ cursor.execute('SELECT crawler_id, id FROM package WHERE md=%s', (md,))  # DB-API parameters are passed as a sequence
if cursor.rowcount <= 0:
return None
- return int(cursor.fetchone()[0])
+ return tuple(map(int, cursor.fetchone()))  # read the row once: md is unique, so a second fetchone() would return None
def new_provider_record(recid, package_id, provider_id, meta):
@@ -54,14 +55,13 @@ def new_provider_record(recid, package_id, provider_id, meta):
except KeyError:
l = ""
- cursor.execute('REPLACE provider_record (id, package_id, provider_id, name, url, download_url, license) VALUES (%s, %s, %s, %s, %s, %s, %s)', (recid, package_id, provider_id, name, url, download_url, l))
-
+ cursor.execute('REPLACE provider_record (id, crawler_id, package_id, provider_id, name, url, download_url, license) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)', (recid, package_id[0], package_id[1], provider_id, name, url, download_url, l))
def new_file(package_id, path, language_id = 0):
- cursor.execute('INSERT INTO file (package_id, path, language_id) VALUES (%s, %s, %s)', (package_id, path, language_id));
+ cursor.execute('INSERT INTO file (crawler_id, package_id, path, language_id) VALUES (%s, %s, %s, %s)', (package_id[0], package_id[1], path, language_id));  # four columns need four placeholders
- return last_insert_id()
+ return (package_id[0], last_insert_id())  # key of the new row: the crawler_id just inserted plus its auto-increment id
def new_word(file_id, text, is_subword):
@@ -70,5 +70,5 @@ def new_word(file_id, text, is_subword):
else:
t = "word"
- cursor.execute('INSERT IGNORE INTO word (text, type, file_id, cnt) VALUES (%s, %s, %s, 0)', (text, t, file_id))
- cursor.execute('UPDATE word SET cnt=cnt+1 WHERE text=%s AND type=%s AND file_id=%s', (text, t, file_id))
+ cursor.execute('INSERT IGNORE INTO word (text, type, crawler_id, file_id, cnt) VALUES (%s, %s, %s, %s, 0)', (text, t, file_id[0], file_id[1]))  # assumes file_id is the (crawler_id, id) pair returned by new_file -- confirm callers
+ cursor.execute('UPDATE word SET cnt=cnt+1 WHERE text=%s AND type=%s AND crawler_id=%s AND file_id=%s', (text, t, file_id[0], file_id[1]))  # count this occurrence; the row is guaranteed to exist after the INSERT IGNORE
diff --git a/feed/sse_tar.py b/feed/sse_tar.py
index bf3cb8e..45700f9 100755
--- a/feed/sse_tar.py
+++ b/feed/sse_tar.py
@@ -115,12 +115,11 @@ def rm_rf(root):
def process_archive(archive, meta = {}, recid = None, provider_id = SSE_PROVIDER_NONE):
if recid is None:
- recid = archive
+ recid = os.path.basename(archive)
- md = calc_md(archive)
- meta["md"] = md
-
- root = os.path.join(HOME, "sources", md)
+ if not meta.has_key("md"):
+ md = calc_md(archive)
+ meta["md"] = md
sse_db.start_transaction()
@@ -128,19 +127,19 @@ def process_archive(archive, meta = {}, recid = None, provider_id = SSE_PROVIDER
package_id = sse_db.find_package(md)
if not package_id is None:
+
print "Package '%s' already in database." % recid
- if not recid is None:
- # Update provider record
- sse_db.new_provider_record(recid, package_id, provider_id, meta)
+ # Update provider record
+ sse_db.new_provider_record(recid, package_id, provider_id, meta)
else:
+ root = os.path.join(HOME, "sources", md)
+
package_id = sse_db.new_package(archive, root, meta)
print "Package '%s' is new in database." % recid
-
- if not recid is None:
- sse_db.new_provider_record(recid, package_id, provider_id, meta)
+ sse_db.new_provider_record(recid, package_id, provider_id, meta)
try:
rm_rf(root)
diff --git a/sse.sql b/sse.sql
index b7b1622..2345053 100644
--- a/sse.sql
+++ b/sse.sql
@@ -4,41 +4,64 @@ DROP TABLE word;
DROP TABLE file;
DROP TABLE package;
DROP TABLE provider_record;
+DROP TABLE crawler;
CREATE TABLE word (
text VARCHAR(40) NOT NULL,
type ENUM ('word', 'subword') DEFAULT 'word' NOT NULL,
+ crawler_id TINYINT UNSIGNED NOT NULL,
file_id INTEGER UNSIGNED NOT NULL,
+
cnt INTEGER UNSIGNED DEFAULT 0 NOT NULL,
- PRIMARY KEY (text, type, file_id)
+
+ PRIMARY KEY (text, type, crawler_id, file_id)
) ENGINE=InnoDB;
CREATE TABLE file (
+ crawler_id TINYINT UNSIGNED NOT NULL,
id INTEGER UNSIGNED NOT NULL AUTO_INCREMENT,
+
package_id INTEGER UNSIGNED NOT NULL,
path VARBINARY(255) NOT NULL,
language_id TINYINT UNSIGNED NOT NULL,
- PRIMARY KEY (id),
- UNIQUE KEY (package_id, path)
+
+ PRIMARY KEY (crawler_id, id), KEY (id),  -- InnoDB requires the AUTO_INCREMENT column to lead some index
+ UNIQUE KEY (crawler_id, package_id, path)
) ENGINE=InnoDB;
CREATE TABLE package (
+ crawler_id TINYINT UNSIGNED NOT NULL,
id INTEGER UNSIGNED NOT NULL AUTO_INCREMENT,
+
path VARBINARY(255) NOT NULL,
timestamp TIMESTAMP NOT NULL,
md CHAR(32) NOT NULL DEFAULT '',
- PRIMARY KEY(id),
+
+ PRIMARY KEY(crawler_id, id), KEY (id),  -- InnoDB requires the AUTO_INCREMENT column to lead some index
UNIQUE KEY (md)
) ENGINE=InnoDB;
CREATE TABLE provider_record (
id VARBINARY(64) NOT NULL,
+
+ crawler_id TINYINT UNSIGNED NOT NULL,
package_id INTEGER UNSIGNED NOT NULL,
+
provider_id TINYINT UNSIGNED NOT NULL,
name VARBINARY(255) NOT NULL DEFAULT 'noname',
url VARBINARY(255) NOT NULL DEFAULT '',
download_url VARBINARY(255) NOT NULL DEFAULT '',
license VARCHAR(64) NOT NULL DEFAULT '',
+
PRIMARY KEY(id),
- UNIQUE KEY (package_id, provider_id)
+ UNIQUE KEY (crawler_id, package_id, provider_id)
+) ENGINE=InnoDB;
+
+CREATE TABLE crawler (
+ id TINYINT UNSIGNED NOT NULL,  -- presumably the value stored in the crawler_id columns; no FOREIGN KEY enforces it
+
+ name VARCHAR(255) NOT NULL,
+ url VARBINARY(255) NOT NULL,
+
+ PRIMARY KEY(id)
) ENGINE=InnoDB;