#!/usr/bin/python
"""Filter Freshmeat project records by license and queue them for grabbing.

Run as a script, this module parses an XML document from standard input.
Every ``<project>`` element is collected into a metadata dict and handed
to ``process_record``, which forwards acceptable projects to
``sse_grab.grab_archive``.
"""

import sys
from xml.sax import ContentHandler, make_parser

# Star import kept as-is: it supplies SSE_PROVIDER_FRESHMEAT (and possibly
# other names used by importers of this module).
from sse_defs import *
import sse_grab

# Projects carrying one of these licenses are ignored.
license_blacklist = [
    "Other/Proprietary License with Free Trial",
    "Free for non-commercial use",
    "Free To Use But Restricted",
    "Freely Distributable",
    "Freeware",
    "Shareware",
    "Other/Proprietary License with Source",
    "Other/Proprietary License",
    "Aladdin Free Public License (AFPL)",
    "Free For Home Use"
    ]

# Only projects carrying one of these licenses are passed to sse_grab.
license_whitelist = [
    "GNU General Public License (GPL)",
    "GNU Lesser General Public License (LGPL)",
    "OSI Approved",
    "The Apache License",
    "Q Public License (QPL)",
    "Public Domain",
    "BSD License (original)",
    "Artistic License",
    "MIT/X Consortium License",
    "The Clarified Artistic License",
    "BSD License (revised)",
    "DFSG approved",
    "GNU Free Documentation License (FDL)",
    "W3C License",
    "Mozilla Public License (MPL)",
    "The Apache License 2.0"
    ]


def process_record(meta):
    """Validate one project record and forward it to ``sse_grab``.

    ``meta`` maps field names (the keys produced by ``docHandler``) to the
    text collected from the XML.  It is modified in place: every value is
    stripped, ``"id"`` is converted to ``int`` and ``"archive-url"`` is set
    to the first non-empty of the tgz, bz2 and zip archive URLs.

    A record is dropped, with a message on standard output, when it

    * has no name or no integer id,
    * has no archive URL,
    * has no license,
    * has a license listed in ``license_blacklist``, or
    * has a license that is not in ``license_whitelist``; in that case the
      project name and license are also appended to the file ``graylist``
      in the current working directory.

    Every other record is passed to ``sse_grab.grab_archive`` together
    with the identifier ``"freshmeat:<id>"`` and ``SSE_PROVIDER_FRESHMEAT``.
    """
    # Iterate over a snapshot of the keys so the dict can be updated safely.
    for key in list(meta):
        meta[key] = meta[key].strip()

    # Every message below needs the name; a record without one used to
    # abort the whole import with a KeyError.
    if "name" not in meta:
        print("Ignoring project without name!")
        return

    try:
        meta["id"] = int(meta["id"])
    except (KeyError, ValueError):
        # One malformed record should not stop the rest of the feed.
        print("Ignoring project '%s' without valid id!" % meta["name"])
        return

    # Preference order: tgz, then bz2, then zip.
    archive_url = None
    for url_field in ("archive-tgz-url", "archive-bz2-url", "archive-zip-url"):
        if meta.get(url_field, "") != "":
            archive_url = meta[url_field]
            break

    if archive_url is None:
        print("Ignoring project '%s' without archive URL!" % meta["name"])
        return

    meta["archive-url"] = archive_url

    if "license" not in meta:
        print("WARNING: Project '%s' has no license!" % meta["name"])
        return

    if meta["license"] in license_blacklist:
        print("Ignoring project '%s' due to evil license '%s'!"
              % (meta["name"], meta["license"]))
        return

    if meta["license"] not in license_whitelist:
        print("WARNING: Unknown license '%s' for project '%s'!"
              % (meta["license"], meta["name"]))

        # Record the unknown license for later review; the context manager
        # closes the file even if the write fails.
        with open("graylist", "a") as graylist:
            graylist.write("%s\t%s\n" % (meta["name"], meta["license"]))

        return

    print("Next record '%s'" % meta["name"])

    sse_grab.grab_archive(meta, "freshmeat:%i" % meta["id"],
                          SSE_PROVIDER_FRESHMEAT)


class docHandler(ContentHandler):
    """SAX handler that builds one metadata dict per ``<project>`` element.

    Text inside the elements listed in ``_FIELDS`` is accumulated in
    ``self.meta`` under the mapped key.  When ``</project>`` is reached the
    dict is passed to ``process_record``.
    """

    # XML element name -> key in the metadata dict.  Note that the latest
    # release date is what gets stored under the "version" key.
    _FIELDS = {
        "project_id": "id",
        "projectname_full": "name",
        "url_project_page": "project-url",
        "url_tgz": "archive-tgz-url",
        "url_bz2": "archive-bz2-url",
        "url_zip": "archive-zip-url",
        "license": "license",
        "latest_release_date": "version",
    }

    def __init__(self):
        # Explicit base call rather than super(): ContentHandler is an
        # old-style class on Python 2.
        ContentHandler.__init__(self)
        # Per-instance state; a class-level dict would be shared between
        # all handler instances.
        self.meta = {}
        self.field = None

    def startElement(self, name, attrs):
        """Start a new record on ``<project>``, else select the field."""
        if name == "project":
            self.meta = {}
            self.field = None
        else:
            # Elements that are not tracked map to None, so their text is
            # ignored by characters().
            self.field = self._FIELDS.get(name)

    def characters(self, data):
        """Append ``data`` to the field currently being collected."""
        # self.meta is None between </project> and the next <project>;
        # text of a tracked element found there is ignored.
        if self.field is None or self.meta is None:
            return

        # SAX may deliver the text of one element in several chunks.
        self.meta[self.field] = self.meta.get(self.field, "") + data

    def endElement(self, name):
        """Process the finished record on ``</project>``; reset the field."""
        if name == "project":
            process_record(self.meta)
            self.meta = None

        self.field = None


def parse_xml(f):
    """Parse the XML document ``f``, processing each project it contains.

    ``f`` is passed unchanged to the SAX parser's ``parse`` method.
    """
    parser = make_parser()
    parser.setContentHandler(docHandler())
    parser.parse(f)


if __name__ == "__main__":
    parse_xml(sys.stdin)