1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
|
#!/usr/bin/python
import sys
from xml.sax import ContentHandler, make_parser
from sse_defs import *
import sse_grab
# Licenses whose projects are rejected outright by process_record().
license_blacklist = [
    "Other/Proprietary License with Free Trial",
    "Free for non-commercial use",
    "Free To Use But Restricted",
    "Freely Distributable",
    "Freeware",
    "Shareware",
    "Other/Proprietary License with Source",
    "Other/Proprietary License",
]
# Licenses accepted by process_record(); any license that is in neither this
# list nor the blacklist is logged to the "graylist" file and skipped.
license_whitelist = [
    "GNU General Public License (GPL)",
    "GNU Lesser General Public License (LGPL)",
    "OSI Approved",
    "The Apache License",
    "Q Public License (QPL)",
    "Public Domain",
    "BSD License (original)",
    "Artistic License",
    "MIT/X Consortium License",
    "The Clarified Artistic License",
    "BSD License (revised)",
    "DFSG approved",
]
def process_record(meta):
    """Filter one parsed project record and pass it on to ``sse_grab``.

    Every value in *meta* is stripped of surrounding whitespace in place and
    ``meta["id"]`` is converted to ``int``.  The record is then skipped, with
    a message on stdout, when:

    * none of the tgz/bz2/zip archive URL fields is present and non-empty,
    * its license is listed in ``license_blacklist``, or
    * its license is not listed in ``license_whitelist`` (in which case the
      project name and license are also appended to the file ``graylist``
      in the current working directory).

    Otherwise the chosen URL is stored as ``meta["archive-url"]`` and the
    record is handed to ``sse_grab.grab_archive``.

    Args:
        meta: dict of field name -> string value.  Must contain ``"id"`` and
            ``"name"``; ``"license"`` is required once an archive URL has
            been found.

    Raises:
        KeyError: if a required field is missing.
        ValueError: if ``meta["id"]`` is not an integer literal.
    """
    # Only values of existing keys are replaced, so the dict never changes
    # size while it is being iterated.
    for key, value in meta.items():
        meta[key] = value.strip()

    meta["id"] = int(meta["id"])

    # Pick the first archive flavour that has a non-empty URL (tgz preferred,
    # then bz2, then zip).
    archive_url = None
    for candidate in ("archive-tgz-url", "archive-bz2-url", "archive-zip-url"):
        if meta.get(candidate):
            archive_url = meta[candidate]
            break

    # NOTE: print() is always called with one pre-formatted string so the
    # output is identical whether print is a statement or a function.
    if archive_url is None:
        print("Ignoring project '%s' without archive URL!" % meta["name"])
        return

    if meta["license"] in license_blacklist:
        print("Ignoring project '%s' due to evil license '%s'!" % (meta["name"], meta["license"]))
        return

    if meta["license"] not in license_whitelist:
        print("WARNING: Unknown license '%s' for project '%s'!" % (meta["license"], meta["name"]))

        # Record the unknown license for later review; the context manager
        # closes the file even if the write fails.
        with open("graylist", "a") as graylist:
            graylist.write("%s\t%s\n" % (meta["name"], meta["license"]))

        return

    meta["archive-url"] = archive_url

    print("Next record '%s'" % meta["name"])

    sse_grab.grab_archive(meta, "freshmeat:%i" % meta["id"], SSE_PROVIDER_FRESHMEAT)
class docHandler(ContentHandler):
    """SAX content handler that collects per-project fields into a dict.

    Text inside recognised child elements of a ``<project>`` element is
    accumulated in ``self.meta`` under the key given by ``_FIELD_FOR_TAG``.
    When ``</project>`` is reached the collected dict is passed to
    ``process_record``.

    Attributes:
        meta: dict of the fields collected for the current project, or
            ``None`` once a project has been handed off and no new one has
            started.
        field: key in ``meta`` that incoming character data is appended to,
            or ``None`` when the current element is not being collected.
    """

    # XML element name -> key used in the metadata dict.
    _FIELD_FOR_TAG = {
        "project_id": "id",
        "projectname_full": "name",
        "url_project_page": "project-url",
        "url_tgz": "archive-tgz-url",
        "url_bz2": "archive-bz2-url",
        "url_zip": "archive-zip-url",
        "license": "license",
        "latest_release_date": "version",
    }

    # Immutable class-level defaults; the real per-instance state is set up
    # in __init__ so that no dict is ever shared between handler instances.
    meta = None
    field = None

    def __init__(self):
        # ContentHandler is an old-style class on Python 2, so super() cannot
        # be used there; call the base initialiser explicitly instead.
        ContentHandler.__init__(self)
        self.meta = {}
        self.field = None

    def startElement(self, name, attrs):
        """Start a new record on ``<project>``, else select the target field."""
        if name == "project":
            self.meta = {}
            self.field = None
        else:
            # Unknown elements map to None, which disables collection.
            self.field = self._FIELD_FOR_TAG.get(name)

    def characters(self, data):
        """Append *data* to the currently selected field, if any."""
        # meta is None between </project> and the next <project>; text seen
        # there has no record to belong to and is dropped.
        if self.field is None or self.meta is None:
            return

        # The parser may deliver one text node in several chunks.
        if self.field in self.meta:
            self.meta[self.field] += data
        else:
            self.meta[self.field] = data

    def endElement(self, name):
        """Stop collecting text; on ``</project>`` process the record."""
        # Always deselect the field so text following a closing tag is not
        # appended to the element that has just ended.
        self.field = None

        if name == "project":
            process_record(self.meta)
            self.meta = None
def parse_xml(f):
    """Run a SAX parse of *f*, feeding all events to a fresh ``docHandler``.

    *f* is passed straight to the parser's ``parse`` method.
    """
    reader = make_parser()
    reader.setContentHandler(docHandler())
    reader.parse(f)
# When run as a script, parse the XML document supplied on standard input.
if __name__ == "__main__":
    parse_xml(sys.stdin)
|