diff options
author | Lennart Poettering <lennart@poettering.net> | 2005-11-21 23:06:55 +0000 |
---|---|---|
committer | Lennart Poettering <lennart@poettering.net> | 2005-11-21 23:06:55 +0000 |
commit | 71e7248cba9a5b78531aeaac7a58e811ec701dba (patch) | |
tree | 85087eaf6464bb47c14419e61243f5d01dbf050f | |
parent | e2df88d73130ed8237efeff3bdae9fd9f5e0c0a3 (diff) |
a day's work
git-svn-id: file:///home/lennart/svn/public/sse/trunk@16 5fbabb74-0606-0410-a5e4-b5cc6a42724e
-rw-r--r-- | feed/sse_db.py | 75 | ||||
-rw-r--r-- | feed/sse_defs.py | 10 | ||||
-rwxr-xr-x | feed/sse_feed.py | 121 | ||||
-rwxr-xr-x | feed/sse_fm.py | 140 | ||||
-rw-r--r-- | feed/sse_grab.py | 79 | ||||
-rwxr-xr-x | feed/sse_tar.py | 136 |
6 files changed, 416 insertions, 145 deletions
import sys, os, MySQLdb, stat

# Module-level connection shared by all helpers below; autocommit is
# disabled so callers drive transactions via start_transaction()/commit().
# NOTE(review): credentials are hard-coded in source — move to a config file.
db = MySQLdb.connect(host="localhost", user="sse_web", passwd="ece6Yoli", db="sse")
cursor = db.cursor()
cursor.execute("SET AUTOCOMMIT=0")


def commit():
    """Commit the current transaction."""
    cursor.execute('COMMIT')


def rollback():
    """Abort the current transaction."""
    cursor.execute('ROLLBACK')


def start_transaction():
    """Open a new transaction on the shared connection."""
    cursor.execute('START TRANSACTION')


def last_insert_id():
    """Return the AUTO_INCREMENT id produced by the last INSERT on this connection."""
    cursor.execute('SELECT LAST_INSERT_ID()')
    return int(cursor.fetchone()[0])


def new_package(archive, root, meta):
    """Insert a package row for the tree unpacked under root.

    meta must carry the archive digest under key "md".
    Returns the new package id.
    """
    cursor.execute('INSERT INTO package (path, timestamp, md) VALUES (%s, NOW(), %s)',
                   (root + '/%s', meta["md"]))
    return last_insert_id()


def find_package(md):
    """Return the package id whose digest is md, or None if unknown."""
    # Fix: pass parameters as a tuple, per the DB-API; a bare string argument
    # relies on MySQLdb-specific leniency.
    cursor.execute('SELECT id FROM package WHERE md=%s', (md,))

    if cursor.rowcount <= 0:
        return None

    return int(cursor.fetchone()[0])


def new_provider_record(recid, package_id, provider_id, meta):
    """Insert or replace the provider's record (name/urls/license) for a package.

    Missing meta fields fall back to neutral defaults.
    """
    name = meta.get("name", "noname")
    url = meta.get("project-url", "")
    download_url = meta.get("archive-url", "")
    l = meta.get("license", "")

    cursor.execute('REPLACE provider_record (id, package_id, provider_id, name, url, download_url, license) VALUES (%s, %s, %s, %s, %s, %s, %s)',
                   (recid, package_id, provider_id, name, url, download_url, l))


def new_file(package_id, path, language_id=0):
    """Register a source file belonging to package_id; returns the new file id."""
    cursor.execute('INSERT INTO file (package_id, path, language_id) VALUES (%s, %s, %s)',
                   (package_id, path, language_id))
    return last_insert_id()


def new_word(file_id, text, is_subword):
    """Record one occurrence of an identifier (or identifier fragment) in a file.

    INSERT IGNORE creates the row with cnt=0 on first sight; the UPDATE then
    bumps the occurrence counter either way.
    """
    if is_subword:
        t = "subword"
    else:
        t = "word"

    cursor.execute('INSERT IGNORE INTO word (text, type, file_id, cnt) VALUES (%s, %s, %s, 0)',
                   (text, t, file_id))
    cursor.execute('UPDATE word SET cnt=cnt+1 WHERE text=%s AND type=%s AND file_id=%s',
                   (text, t, file_id))
# Shared constants for the SSE feed pipeline.

# Identifiers for the upstream source of a package record.
SSE_PROVIDER_NONE = 0
SSE_PROVIDER_FRESHMEAT = 1
SSE_PROVIDER_DEBIAN = 2

# Language ids used in the file table.
SSE_LANGUAGE_C = 0

# Refuse to download archives larger than 40 MiB.
SSE_MAX_DOWNLOAD = 40 * 1024 * 1024

# Chunk size for streaming reads/writes (10 KiB).
SSE_BLOCK_SIZE = 10 * 1024
def camel_split(word):
    """Split a camelCase identifier at each lower→upper transition.

    "fooBarBaz" -> ["foo", "Bar", "Baz"]; the empty string yields [].
    """
    if not word:
        return []

    parts = []
    start = 0

    for pos in range(len(word) - 1):
        if word[pos].islower() and word[pos + 1].isupper():
            parts.append(word[start:pos + 1])
            start = pos + 1

    parts.append(word[start:])
    return parts


def default_subword_split(word):
    """Return the suffix subwords of an identifier.

    The identifier is split on underscores if it contains any, otherwise on
    camelCase boundaries. For each split point after the first component, the
    joined remainder (keeping the original delimiter) is emitted, provided it
    is at least four characters long.
    """
    pieces = word.split("_")

    if len(pieces) > 1:
        sep = "_"
    else:
        pieces = camel_split(word)
        sep = ""

    subwords = []

    if len(pieces) > 1:
        for idx in range(1, len(pieces)):
            if not pieces[idx]:
                continue

            candidate = sep.join(pieces[idx:])
            if len(candidate) >= 4:
                subwords.append(candidate)

    return subwords
# Licenses that disqualify a project outright.
license_blacklist = [
    "Other/Proprietary License with Free Trial",
    "Free for non-commercial use",
    "Free To Use But Restricted",
    "Freely Distributable",
    "Freeware",
    "Shareware",
    "Other/Proprietary License with Source",
    "Other/Proprietary License",
    ]

# Licenses known to be acceptable for indexing.
license_whitelist = [
    "GNU General Public License (GPL)",
    "GNU Lesser General Public License (LGPL)",
    "OSI Approved",
    "The Apache License",
    "Q Public License (QPL)",
    "Public Domain",
    "BSD License (original)",
    "Artistic License",
    "MIT/X Consortium License",
    "The Clarified Artistic License",
    "BSD License (revised)"
    ]


def process_record(meta):
    """Vet one freshmeat project record and hand it to the grabber.

    Strips all field values in place, normalizes the id to int, picks the
    first available archive URL, and filters by license: blacklisted
    projects are dropped, unknown licenses are appended to the "graylist"
    file for later review, and only whitelisted ones are downloaded.
    """
    download_url = None

    for key, value in meta.items():
        meta[key] = value.strip()

    meta["id"] = int(meta["id"])

    # Prefer tgz over bz2 over zip — first populated field wins.
    for key in ("archive-tgz-url", "archive-bz2-url", "archive-zip-url"):
        if key in meta and meta[key] != "":
            download_url = meta[key]
            break

    if download_url is None:
        print("Ignoring project '%s' without archive URL!" % meta["name"])
        return

    if meta["license"] in license_blacklist:
        print("Ignoring project '%s' due to evil license '%s'!" % (meta["name"], meta["license"]))
        return

    if meta["license"] not in license_whitelist:
        print("WARNING: Unknown license '%s' for project '%s'!" % (meta["license"], meta["name"]))

        graylist = open("graylist", "a")
        graylist.write("%s\t%s\n" % (meta["name"], meta["license"]))
        graylist.close()
        return

    meta["archive-url"] = download_url

    print("Next record '%s'" % meta["name"])

    sse_grab.grab_archive(meta, "freshmeat:%i" % meta["id"], SSE_PROVIDER_FRESHMEAT)
def grab_archive(meta, recid, provider_id = SSE_PROVIDER_NONE):
    """Download the archive described by meta and feed it to sse_tar.

    A "<recid>.release" stamp file under HOME/download records the version
    last processed; if it matches meta["version"] the archive is skipped.
    The downloaded archive itself is deleted after processing.
    """
    try:
        os.mkdir("%s/download" % HOME)
    except:
        pass  # best effort — directory usually exists already

    fn = os.path.join(HOME, "download", recid)

    download = False

    try:
        f = open(fn + ".release", "r")
    except:
        download = True
    else:
        download = f.read() != meta["version"]
        f.close()

    archive_url = meta["archive-url"]

    if not download:
        # Fix: stop here. The archive file fn was deleted after the previous
        # run, so falling through to process_archive() would fail on a
        # missing file.
        print("File %s up-to-date." % archive_url)
        return

    print("Downloading %s..." % archive_url)

    try:
        # Fix: write in binary mode — archives are raw bytes.
        dst = open(fn, "wb")
        src = urllib2.urlopen(archive_url)
        m = 0

        while True:
            # Bail out (and clean up) on oversized archives.
            if m > SSE_MAX_DOWNLOAD:
                os.unlink(fn)
                print("WARNING: Archive too large, ignoring.")
                return

            data = src.read(SSE_BLOCK_SIZE)

            if len(data) <= 0:
                break

            dst.write(data)
            m += len(data)

        dst.close()
        del src

    except IOError:
        os.unlink(fn)
        print("WARNING: Failed to download %s!" % archive_url)
        return

    sse_tar.process_archive(fn, meta, recid, provider_id)
    os.unlink(fn)

    # Record the processed version so the next run can skip it.
    try:
        f = open(fn + ".release", "w")
    except:
        # Fix: the archive was already unlinked above; just warn instead of
        # unlinking it again (which raised) and writing to an unbound f.
        print("WARNING: Cannot write release stamp for %s!" % recid)
        return

    f.write(meta["version"])
    f.close()
def uncompress_zip(archive, root, package_id, meta = None):
    """Extract supported source files from a ZIP archive under root.

    Each extracted file keeps its archived mtime and is handed to
    sse_feed.process_source for indexing. Returns the number of files
    processed. Raises zipfile.error for non-ZIP input.
    """
    # Fix: avoid a shared mutable default argument.
    if meta is None:
        meta = {}

    n = 0
    f = zipfile.ZipFile(archive, "r")

    print("Processing ZIP file %s." % archive)

    for i in f.infolist():

        if not sse_feed.supported_source(i.filename):
            continue

        dst = os.path.join(root, i.filename)

        try:
            os.makedirs(os.path.dirname(dst))
        except OSError:
            pass  # parent directory already exists

        # Fix: binary mode — archive members are raw bytes.
        o = open(dst, "wb")
        o.write(f.read(i.filename))
        o.close()

        # Fix: time.mktime requires a 9-tuple, not a list.
        (year, month, day, hour, minute, second) = i.date_time
        t = time.mktime((year, month, day, hour, minute, second, 0, 0, 0))
        os.utime(dst, (t, t))

        sse_feed.process_source(archive, root, i.filename, package_id, meta)
        n += 1

    f.close()
    return n


def calc_md(fn):
    """Return the hex MD5 digest of the file fn, streamed in 1 KiB chunks."""
    m = message_digest()
    # Fix: binary mode, so newline translation can never corrupt the digest.
    f = open(fn, "rb")

    while True:
        data = f.read(1024)

        if len(data) <= 0:
            break

        m.update(data)

    f.close()
    return m.hexdigest()


def rm_rf(root):
    """Recursively delete the directory tree rooted at root (like rm -rf)."""
    # Walk bottom-up so directories are empty by the time we rmdir them.
    for base, dirs, files in os.walk(root, topdown = False):
        for name in files:
            os.remove(os.path.join(base, name))
        for name in dirs:
            os.rmdir(os.path.join(base, name))

    os.rmdir(root)
def process_archive(archive, meta = None, recid = None, provider_id = SSE_PROVIDER_NONE):
    """Register an archive in the database and index its source files.

    The archive is identified by its MD5 digest; its tree is unpacked under
    HOME/sources/<md>. A package already known by digest only gets a fresh
    provider record. Tries TAR first, then ZIP.
    """
    # Fix: meta defaulted to a shared mutable {} that this function mutates
    # below (meta["md"] = md), leaking state between calls.
    if meta is None:
        meta = {}

    if recid is None:
        recid = archive

    md = calc_md(archive)

    root = os.path.join(HOME, "sources", md)

    # Start from a clean extraction directory.
    try:
        rm_rf(root)
    except:
        pass
    os.makedirs(root)

    meta["md"] = md

    sse_db.start_transaction()

    package_id = sse_db.find_package(md)

    if package_id is None:
        package_id = sse_db.new_package(archive, root, meta)

        print("Package '%s' is new in database." % recid)

        # recid can no longer be None here (defaulted to archive above);
        # the guard is kept for safety.
        if recid is not None:
            sse_db.new_provider_record(recid, package_id, provider_id, meta)

        try:
            n = uncompress_tar(archive, root, package_id, meta)
        except tarfile.TarError:
            try:
                n = uncompress_zip(archive, root, package_id, meta)
            except zipfile.error:
                n = None
                print("Unknown file format.")

        if n is not None:
            print("Successfully processed %i files." % n)

    else:

        print("Package '%s' already in database." % recid)

        if recid is not None:
            sse_db.new_provider_record(recid, package_id, provider_id, meta)

    sse_db.commit()

if __name__ == "__main__":
    process_archive(sys.argv[1])