summary | refs | log | tree | commit | diff | stats
path: root/feed
diff options
context:
space:
mode:
author Lennart Poettering <lennart@poettering.net> 2005-11-21 23:06:55 +0000
committer Lennart Poettering <lennart@poettering.net> 2005-11-21 23:06:55 +0000
commit 71e7248cba9a5b78531aeaac7a58e811ec701dba (patch)
tree 85087eaf6464bb47c14419e61243f5d01dbf050f /feed
parent e2df88d73130ed8237efeff3bdae9fd9f5e0c0a3 (diff)
a days work
git-svn-id: file:///home/lennart/svn/public/sse/trunk@16 5fbabb74-0606-0410-a5e4-b5cc6a42724e
Diffstat (limited to 'feed')
-rw-r--r-- feed/sse_db.py 75
-rw-r--r-- feed/sse_defs.py 10
-rwxr-xr-x feed/sse_feed.py 121
-rwxr-xr-x feed/sse_fm.py 140
-rw-r--r-- feed/sse_grab.py 79
-rwxr-xr-x feed/sse_tar.py 136
6 files changed, 416 insertions, 145 deletions
diff --git a/feed/sse_db.py b/feed/sse_db.py
new file mode 100644
index 0000000..4cf2af3
--- /dev/null
+++ b/feed/sse_db.py
@@ -0,0 +1,75 @@
+
+import sys, os, MySQLdb, stat
+
+# Module-wide connection and cursor; every helper below issues its
+# statements through this single shared cursor.
+# NOTE(review): host, user, password and database name are hard-coded
+# here -- consider moving them into configuration.
+db = MySQLdb.connect(host = "localhost", user = "sse_web", passwd = "ece6Yoli", db = "sse")
+cursor = db.cursor();
+# Switch autocommit off for this session; commit() and rollback() below
+# issue the matching COMMIT / ROLLBACK statements.
+cursor.execute("SET AUTOCOMMIT=0")
+
+def commit():
+ """Execute a COMMIT statement on the shared cursor."""
+ cursor.execute('COMMIT')
+
+def rollback():
+ """Execute a ROLLBACK statement on the shared cursor."""
+ cursor.execute('ROLLBACK')
+
+def start_transaction():
+ """Execute a START TRANSACTION statement on the shared cursor."""
+ cursor.execute('START TRANSACTION')
+
+def last_insert_id():
+    """Return, as an int, the value of SELECT LAST_INSERT_ID()."""
+    cursor.execute('SELECT LAST_INSERT_ID()')
+    row = cursor.fetchone()
+    return int(row[0])
+
+def new_package(archive, root, meta):
+    """Insert a row into the package table and return its new id.
+
+    The stored path is root with a literal '/%s' appended, the timestamp is
+    NOW() and the digest is meta["md"].  The archive argument is accepted
+    but not used by this function.
+    """
+    path_template = root + '/%s'
+    values = (path_template, meta["md"])
+
+    cursor.execute('INSERT INTO package (path, timestamp, md) VALUES (%s, NOW(), %s)', values)
+
+    return last_insert_id()
+
+def find_package(md):
+    """Return the id of the package whose md column equals md.
+
+    Returns None when no such package row exists.
+    """
+
+    # DB-API parameters have to be a sequence (or mapping).  A bare string
+    # is itself a sequence of characters, so drivers that iterate over the
+    # parameters would see one parameter per character; wrap it in a tuple.
+    cursor.execute('SELECT id FROM package WHERE md=%s', (md,))
+
+    # fetchone() yields None when there is no row, which avoids relying on
+    # the driver-specific value of cursor.rowcount for SELECT statements.
+    row = cursor.fetchone()
+
+    if row is None:
+        return None
+
+    return int(row[0])
+
+def new_provider_record(recid, package_id, provider_id, meta):
+    """Insert or replace the provider_record row identified by recid.
+
+    Optional fields are read from meta; a missing "name" is stored as
+    "noname", while missing "project-url", "archive-url" and "license"
+    entries are stored as empty strings.
+    """
+
+    values = (
+        recid,
+        package_id,
+        provider_id,
+        meta.get("name", "noname"),
+        meta.get("project-url", ""),
+        meta.get("archive-url", ""),
+        meta.get("license", ""),
+    )
+
+    cursor.execute('REPLACE provider_record (id, package_id, provider_id, name, url, download_url, license) VALUES (%s, %s, %s, %s, %s, %s, %s)', values)
+
+
+def new_file(package_id, path, language_id = 0):
+    """Insert a row into the file table and return its new id."""
+
+    values = (package_id, path, language_id)
+
+    cursor.execute('INSERT INTO file (package_id, path, language_id) VALUES (%s, %s, %s)', values)
+
+    return last_insert_id()
+
+def new_word(file_id, text, is_subword):
+    """Count one occurrence of text for the given file.
+
+    The row is created with cnt 0 if it does not exist yet (INSERT IGNORE)
+    and its cnt is then incremented by one.  The type column is "subword"
+    when is_subword is true and "word" otherwise.
+    """
+
+    kind = "word"
+    if is_subword:
+        kind = "subword"
+
+    # Both statements address the same (text, type, file_id) triple.
+    key = (text, kind, file_id)
+
+    cursor.execute('INSERT IGNORE INTO word (text, type, file_id, cnt) VALUES (%s, %s, %s, 0)', key)
+    cursor.execute('UPDATE word SET cnt=cnt+1 WHERE text=%s AND type=%s AND file_id=%s', key)
diff --git a/feed/sse_defs.py b/feed/sse_defs.py
new file mode 100644
index 0000000..9c2ce53
--- /dev/null
+++ b/feed/sse_defs.py
@@ -0,0 +1,10 @@
+
+
+# Provider ids; these are passed as provider_id when a provider record
+# is stored (sse_fm.py uses SSE_PROVIDER_FRESHMEAT).
+SSE_PROVIDER_NONE = 0
+SSE_PROVIDER_FRESHMEAT = 1
+SSE_PROVIDER_DEBIAN = 2
+
+# Language id used by the single entry of sse_feed.supported_languages.
+SSE_LANGUAGE_C = 0
+
+# Largest archive, in bytes (40 MiB), that sse_grab will download.
+SSE_MAX_DOWNLOAD = 1024*1024*40
+# Chunk size, in bytes (10 KiB), used for read() calls when copying data.
+SSE_BLOCK_SIZE = 10*1024
diff --git a/feed/sse_feed.py b/feed/sse_feed.py
index c6f0de9..d31d826 100755
--- a/feed/sse_feed.py
+++ b/feed/sse_feed.py
@@ -1,81 +1,98 @@
#!/usr/bin/python
-import sys, os, MySQLdb, stat
+import sys, os, stat, string
from popen2 import Popen3
-supported = [".c", ".h"]
+import sse_db
+from sse_config import *
+from sse_defs import *
-def supported_source(fn):
+def camel_split(word):
- for e in supported:
- if fn.endswith(e):
- return True
+ if len(word) <= 0:
+ return []
- return False
+ r = []
+ last = 0
-def last_insert_id(cursor):
- cursor.execute("SELECT LAST_INSERT_ID()");
- return cursor.fetchone()[0]
+ for i in range(0, len(word)-1):
+
+ if word[i].islower() and word[i+1].isupper():
+ r.append(word[last:i+1])
+ last = i+1
-def process_file(package_id, root, path):
- global cursor
- print "Processing %s" % path
+ r.append(word[last:])
- cursor.execute("INSERT INTO file (package_id, path, language_id) VALUES (%i, '%s', '0')" % (package_id, path));
+ return r
+
- file_id = last_insert_id(cursor);
+def default_subword_split(word):
+ r = []
+
+ w = word.split("_")
- p = Popen3("lex-c %s" % (os.path.join(root, path)))
+ if len(w) > 1:
+ delimiter = "_"
+ else:
+ w = camel_split(word)
+ delimiter = ""
- for identifier in p.fromchild:
- text = identifier.strip()
-
- cursor.execute("INSERT IGNORE INTO word (text, type, file_id) VALUES ('%s', 'word', '%i')" % (text, file_id))
- cursor.execute("UPDATE word SET cnt=cnt+1 WHERE text='%s' AND type='word' AND file_id=%i" % (text, file_id))
+ if len(w) > 1:
- if p.wait() != 0:
- print "WARNING: Subprocess failed!"
+ for i in range(1, len(w)):
- del p
+ if len(w[i]) == 0:
+ continue
+
+ n = string.join(w[i:], delimiter)
-def handle_file(package_id, root, path, filename):
+ if len(n) >= 4:
+ r.append(n)
- t = sys.lstat(os.path.join(path, filename))
+ return r
+
+supported_languages = [ {
+ "extensions" : [".c", ".h", ".cc", ".hh", ".cpp", ".hpp"],
+ "subword_split" : default_subword_split,
+ "lexer" : SSE_DIR+"/sse_lex_c",
+ "language_id" : SSE_LANGUAGE_C
+ }]
- if stat.F_ISREG(t.st_mode):
+def find_language(fn):
+
+ for l in supported_languages:
+ for e in l["extensions"]:
+ if fn.lower().endswith(e):
+ return l
- extension = filename.split(".")[-1]
+ return None
- if extension in ("c", "h"):
- process_file(package_id, root, os.path.join(path, filename))
- return
+def supported_source(fn):
+ return not find_language(fn) is None
- os.unlink(os.path.join(root, path, filename))
+def process_source(archive, root, path, package_id, meta):
+ print "(%s) Processing %s" % (archive, path)
-def handle_tree(path, name, url, md):
- global cursor
+ language = find_language(path)
- cursor.execute("INSERT INTO package (path, name, url, timestamp, md) VALUES ('%s', '%s', '%s', NOW(), '%s')" % (path + "/%s", name, url, md));
- package_id = last_insert_id(cursor);
+ assert not language is None
- path = os.path.realpath(path)
-
- for dirpath, dirs, files in os.walk(path):
- for f in files:
- assert path + "/" == (dirpath + "/") [:len(path)+1]
+ file_id = sse_db.new_file(package_id, path, language["language_id"])
- handle_file(package_id, path, dirpath[len(path)+1:], f)
+ p = Popen3("%s %s" % (language["lexer"], os.path.join(root, path)))
-if __name__ == "__main__":
- db = MySQLdb.connect(host = "localhost", user = "sse_web", passwd = "ece6Yoli", db = "sse")
- cursor = db.cursor();
- cursor.execute("SET AUTOCOMMIT=0")
- cursor.execute("START TRANSACTION")
+ subword_split = language["subword_split"]
- assert len(sys.argv) == 5
+ for identifier in p.fromchild:
+
+ text = identifier.strip()
+ sse_db.new_word(file_id, text, False)
- handle_tree(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
+ subwords = subword_split(text)
+ for w in subwords:
+ sse_db.new_word(file_id, w, True)
+
+ if p.wait() != 0:
+ print "WARNING: Subprocess failed!"
- cursor.execute("COMMIT")
- cursor.close()
- db.close()
+ del p
diff --git a/feed/sse_fm.py b/feed/sse_fm.py
index dd45d58..7161d7a 100755
--- a/feed/sse_fm.py
+++ b/feed/sse_fm.py
@@ -1,89 +1,81 @@
#!/usr/bin/python
-
-import sys, urllib2, os, socket
+import sys
from xml.sax import ContentHandler, make_parser
-from sse-config import *
-
-SSE_TAR = SSE_DIR + "sse-tar"
-def process_tar(project, tar):
- print "New tar %s" % tar
+from sse_defs import *
+import sse_grab
+
+# Licenses for which process_record() ignores the project outright.
+license_blacklist = [
+ "Other/Proprietary License with Free Trial",
+ "Free for non-commercial use",
+ "Free To Use But Restricted",
+ "Freely Distributable",
+ "Freeware",
+ "Shareware",
+ "Other/Proprietary License with Source",
+ "Other/Proprietary License",
+ ]
+
+# Licenses accepted by process_record(); a license found in neither list
+# is appended to the "graylist" file and the project is skipped.
+license_whitelist = [
+ "GNU General Public License (GPL)",
+ "GNU Lesser General Public License (LGPL)",
+ "OSI Approved",
+ "The Apache License",
+ "Q Public License (QPL)",
+ "Public Domain",
+ "BSD License (original)",
+ "Artistic License",
+ "MIT/X Consortium License",
+ "The Clarified Artistic License",
+ "BSD License (revised)"
+ ]
+
+def process_record(meta):
- ret = os.system("%s '%s' '%s' '%s'" % (SSE_TAR, tar, project["name"], project["project-url"]))
-
- if ret != 0:
- print "WARNING: Process returned %i" % ret
+ archive_url = None
-def process_project(project):
+ for k, v in meta.items():
+ meta[k] = v.strip()
- archive_url = None
+ meta["id"] = int(meta["id"])
for a in ("archive-tgz-url", "archive-bz2-url", "archive-zip-url"):
- if project.has_key(a) and project[a] != "":
- archive_url = project[a]
+ if meta.has_key(a) and meta[a] != "":
+ archive_url = meta[a]
break
if archive_url is None:
- print "WARNING: Ignoring project '%s' without archive URL!" % project["name"]
+ print "Ignoring project '%s' without archive URL!" % meta["name"]
return
-
- fn = "%s/download/freshmeat-%i" % (HOME, int(project["id"]))
-
- download = False
-
- try:
- f = open(fn+".release", "r")
- except:
- download = True
- else:
- download = f.read() != project["date"].strip()
-
- if not download:
- print "File %s up-to-date." % archive_url
- else:
- print "Downloading %s..." % archive_url
-
- try:
- dst = file(fn, "w")
- src = urllib2.urlopen(archive_url)
-
- while True:
- data = src.read(1024)
+ if meta["license"] in license_blacklist:
+ print "Ignoring project '%s' due to evil license '%s'!" % (meta["name"], meta["license"])
+ return
- if len(data) <= 0:
- break
+ if meta["license"] not in license_whitelist:
+ print "WARNING: Unknown license '%s' for project '%s'!" % (meta["license"], meta["name"])
- dst.write(data)
+ f = file("graylist", "a")
+ f.write("%s\t%s\n" % (meta["name"], meta["license"]))
+ f.close()
+ return
- del dst
- del src
-
- except IOError, e:
- os.unlink(fn)
- print "WARNING: Failed to download %s!" % archive_url
- return
+ meta["archive-url"] = archive_url
- try:
- f = open(fn+".release", "w")
- except:
- os.unlink(fn)
+ print "Next record '%s'" % meta["name"]
- f.write(project["date"].strip())
- del f
-
- process_tar(project, fn)
+ sse_grab.grab_archive(meta, "freshmeat:%i" % meta["id"], SSE_PROVIDER_FRESHMEAT)
class docHandler(ContentHandler):
- project_data = {}
+ meta = {}
field = None
def startElement(self, name, attrs):
if name == "project":
- self.project_data = {}
+ self.meta = {}
self.field = None
elif name == "project_id":
self.field = "id"
@@ -100,35 +92,31 @@ class docHandler(ContentHandler):
elif name == "license":
self.field = "license"
elif name == "latest_release_date":
- self.field = "date"
+ self.field = "version"
else:
self.field = None
def characters(self, data):
if not self.field is None:
- if self.project_data.has_key(self.field):
- self.project_data[self.field] += data
+ if self.meta.has_key(self.field):
+ self.meta[self.field] += data
else:
- self.project_data[self.field] = data
+ self.meta[self.field] = data
def endElement(self, name):
if name == "project":
- process_project(self.project_data)
- self.project_data = None
+ process_record(self.meta)
+ self.meta = None
self.field = None
-try:
- os.mkdir("%s/download" % HOME)
-except:
- pass
-
-socket.setdefaulttimeout(20)
-
-dh = docHandler()
+def parse_xml(f):
-parser = make_parser()
+ dh = docHandler()
+ parser = make_parser()
+ parser.setContentHandler(dh)
+ parser.parse(f)
-parser.setContentHandler(dh)
-parser.parse(sys.stdin)
+if __name__ == "__main__":
+ parse_xml(sys.stdin)
diff --git a/feed/sse_grab.py b/feed/sse_grab.py
new file mode 100644
index 0000000..6142f27
--- /dev/null
+++ b/feed/sse_grab.py
@@ -0,0 +1,79 @@
+#!/usr/bin/python
+
+import sys, urllib2, os, socket
+
+import sse_tar
+from sse_defs import *
+from sse_config import *
+
+socket.setdefaulttimeout(20)
+
+def _fetch_archive(archive_url, fn):
+    """Copy archive_url into the file fn.
+
+    Returns True when the whole archive was stored.  Returns False, after
+    printing a warning and removing the partial file, when the download
+    fails with IOError or grows beyond SSE_MAX_DOWNLOAD bytes.
+    """
+
+    dst = None
+    src = None
+    complete = False
+
+    try:
+        try:
+            # Archives are binary data, so do not open in text mode.
+            dst = file(fn, "wb")
+            src = urllib2.urlopen(archive_url)
+            m = 0
+
+            while True:
+                data = src.read(SSE_BLOCK_SIZE)
+
+                if len(data) <= 0:
+                    complete = True
+                    break
+
+                m += len(data)
+
+                if m > SSE_MAX_DOWNLOAD:
+                    print "WARNING: Archive too large, ignoring."
+                    break
+
+                dst.write(data)
+
+        except IOError:
+            print "WARNING: Failed to download %s!" % archive_url
+    finally:
+        # Always release both handles before touching the file on disk.
+        if src is not None:
+            src.close()
+        if dst is not None:
+            dst.close()
+        # The file may not exist at all if opening it was what failed.
+        if not complete and os.path.exists(fn):
+            os.unlink(fn)
+
+    return complete
+
+
+def grab_archive(meta, recid, provider_id = SSE_PROVIDER_NONE):
+    """Download the archive described by meta and hand it to sse_tar.
+
+    meta        -- record dictionary; "archive-url" and "version" are read.
+    recid       -- record identifier, also used as the file name below
+                   HOME/download.
+    provider_id -- passed through to sse_tar.process_archive().
+
+    The version that was processed last is kept in "<recid>.release".  When
+    it equals meta["version"] nothing is downloaded.  Otherwise the archive
+    is fetched, processed, deleted again and the release file is rewritten.
+    """
+
+    download_dir = os.path.join(HOME, "download")
+
+    try:
+        os.mkdir(download_dir)
+    except OSError:
+        # Normally this just means the directory exists already; any real
+        # problem surfaces when the archive file is opened for writing.
+        pass
+
+    fn = os.path.join(download_dir, recid)
+    release_fn = fn + ".release"
+    archive_url = meta["archive-url"]
+
+    try:
+        f = open(release_fn, "r")
+    except IOError:
+        # No release file yet: this record was never processed.
+        download = True
+    else:
+        try:
+            download = f.read() != meta["version"]
+        finally:
+            f.close()
+
+    if not download:
+        print "File %s up-to-date." % archive_url
+        return
+
+    print "Downloading %s..." % archive_url
+
+    if not _fetch_archive(archive_url, fn):
+        return
+
+    try:
+        sse_tar.process_archive(fn, meta, recid, provider_id)
+    finally:
+        # The downloaded archive is only needed while it is processed.
+        os.unlink(fn)
+
+    try:
+        f = open(release_fn, "w")
+    except IOError:
+        # Without the release file the archive is simply fetched again on
+        # the next run; the archive itself has already been removed above.
+        print "WARNING: Failed to write %s!" % release_fn
+        return
+
+    try:
+        f.write(meta["version"])
+    finally:
+        f.close()
+
diff --git a/feed/sse_tar.py b/feed/sse_tar.py
index 79671f6..bb01987 100755
--- a/feed/sse_tar.py
+++ b/feed/sse_tar.py
@@ -1,12 +1,19 @@
#!/usr/bin/python
-import tarfile, zipfile, sys, os
-import sse_feed
+import sse_feed, sse_db
-def archive_uncompress(archive, root, meta = {}):
+from sse_config import *
+from sse_defs import *
+import tarfile, zipfile, sys, os, time
+from md5 import new as message_digest
+
+def uncompress_tar(archive, root, package_id, meta = {}):
+ n = 0
f = tarfile.open(archive, "r")
+ print "Processing TAR file %s." % archive
+
while True:
i = f.next()
@@ -21,46 +28,141 @@ def archive_uncompress(archive, root, meta = {}):
continue
dst = os.path.join(root, i.name)
- f.extract(i, dst)
+
+ try:
+ os.makedirs(os.path.dirname(dst))
+ except:
+ pass
+
+ x = f.extractfile(i)
+ o = file(dst, "w")
+ while True:
+ data = x.read(SSE_BLOCK_SIZE)
+
+ if len(data) <= 0:
+ break
+
+ o.write(data)
+ o.close()
+
os.utime(dst, (i.mtime, i.mtime))
- sse_feed.process_source(archive, root, i.name, meta)
+ sse_feed.process_source(archive, root, i.name, package_id, meta)
+ n += 1
+
+ f.close()
+ return n
+
+def uncompress_zip(archive, root, package_id, meta = {}):
+ n = 0
+ f = zipfile.ZipFile(archive, "r")
+
+ print "Processing ZIP file %s." % archive
+
+ for i in f.infolist():
+
+ if not sse_feed.supported_source(i.filename):
+ continue
+
+ dst = os.path.join(root, i.filename)
+
+ try:
+ os.makedirs(os.path.dirname(dst))
+ except:
+ pass
+
+ o = file(dst, "w")
+ o.write(f.read(i.filename))
+ o.close()
+
+ (year, month, day, hour, minute, second) = i.date_time
+ t = time.mktime([year, month, day, hour, minute, second, 0, 0, 0])
+ os.utime(dst, (t, t))
- del f
+ sse_feed.process_source(archive, root, i.filename, package_id, meta)
+ n += 1
+
+ f.close()
+ return n
def calc_md(fn):
- m = md5.new()
- f = fopen(fn)
+ m = message_digest()
+ f = file(fn)
while True:
- data = m.read(1024)
+ data = f.read(1024)
if len(data) <= 0:
break
m.update(data)
- del f
+ f.close()
return m.hexdigest()
+
+def rm_rf(root):
+    """Recursively delete the directory tree rooted at root.
+
+    Raises OSError when an entry cannot be removed, which includes the
+    case that root itself does not exist.
+    """
+
+    # Walk bottom-up so every directory is empty by the time it is removed.
+    # The loop variable is deliberately not called "root": rebinding the
+    # parameter would make the final rmdir depend on the last path yielded.
+    for dirpath, dirs, files in os.walk(root, topdown = False):
+        for f in files:
+            os.remove(os.path.join(dirpath, f))
+        for d in dirs:
+            p = os.path.join(dirpath, d)
+            # os.walk lists symlinks to directories among dirs, but they
+            # have to be unlinked like files; rmdir fails on them.
+            if os.path.islink(p):
+                os.remove(p)
+            else:
+                os.rmdir(p)
+
+    os.rmdir(root)
-def process_archive(archive, meta = {}):
+
+def process_archive(archive, meta = {}, recid = None, provider_id = SSE_PROVIDER_NONE):
+
+ if recid is None:
+ recid = archive
md = calc_md(archive)
root = os.path.join(HOME, "sources", md)
-
+
try:
- os.mkdir(root)
+ rm_rf(root)
except:
pass
+ os.makedirs(root)
meta["md"] = md
-
- archive_uncompress(archive, root, meta)
+
+ sse_db.start_transaction()
+
+ package_id = sse_db.find_package(md)
+
+ if package_id is None:
+ package_id = sse_db.new_package(archive, root, meta)
+
+ print "Package '%s' is new in database." % recid
+
+ if not recid is None:
+ sse_db.new_provider_record(recid, package_id, provider_id, meta)
+
+ try:
+ n = uncompress_tar(archive, root, package_id, meta)
+ except tarfile.TarError:
+ try:
+ n = uncompress_zip(archive, root, package_id, meta)
+ except zipfile.error:
+ n = None
+ print "Unknown file format."
+
+ if not n is None:
+ print "Successfully processed %i files." % n
+
+ else:
+
+ print "Package '%s' already in database." % recid
+
+ if not recid is None:
+ sse_db.new_provider_record(recid, package_id, provider_id, meta)
+
+ sse_db.commit()
if __name__ == "__main__":
-
- archive_uncompress(sys.argv[1])
+ process_archive(sys.argv[1])