summary refs log tree commit diff stats
path: root/feed/sse_fm.py
blob: 0edb92dc9f5e9b0692510e443b435ff23e1f7213 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/python

import sys
from xml.sax import ContentHandler, make_parser

from sse_defs import *
import sse_grab

# License names that cause a project to be skipped outright; process_record
# reports such projects as having an "evil license".
license_blacklist = [
    "Other/Proprietary License with Free Trial",
    "Free for non-commercial use",
    "Free To Use But Restricted",
    "Freely Distributable",
    "Freeware",
    "Shareware",
    "Other/Proprietary License with Source",
    "Other/Proprietary License",
]

# License names accepted for grabbing. A license found in neither this list
# nor license_blacklist is treated as unknown and recorded in the graylist
# file by process_record.
license_whitelist = [
    "GNU General Public License (GPL)",
    "GNU Lesser General Public License (LGPL)",
    "OSI Approved",
    "The Apache License",
    "Q Public License (QPL)",
    "Public Domain",
    "BSD License (original)",
    "Artistic License",
    "MIT/X Consortium License",
    "The Clarified Artistic License",
    "BSD License (revised)",
    "DFSG approved",
]

def process_record(meta):
    """Filter one project record and pass acceptable ones to the grabber.

    ``meta`` is the dict of string fields collected for a single project.
    It is modified in place: every value is stripped of surrounding
    whitespace, ``meta["id"]`` is converted to an int and, for records that
    are accepted, ``meta["archive-url"]`` is set to the first non-empty of
    the tgz, bz2 and zip archive URLs (in that order of preference).

    A record is skipped, with a message on stdout, when:

    * it has no ``id`` or ``name`` field, or the id is not an integer;
    * none of the archive URL fields is present and non-empty;
    * its license is listed in ``license_blacklist``;
    * its license is not listed in ``license_whitelist`` -- in that case the
      project name and license are also appended, tab separated, to the
      file ``graylist`` in the current directory.  A record without a
      license field is handled as having the unknown license ``""``.

    Every other record is handed to ``sse_grab.grab_archive`` together with
    the key ``"freshmeat:<id>"`` and ``SSE_PROVIDER_FRESHMEAT``.

    Always returns None.
    """
    import io  # local import: only the graylist write below needs it

    # Iterate over a snapshot of the keys; only the values are replaced.
    for key in list(meta):
        meta[key] = meta[key].strip()

    # An empty XML element never produces a key, so incomplete records do
    # occur; skip them instead of aborting the whole run with a KeyError.
    missing = [key for key in ("id", "name") if key not in meta]
    if missing:
        print("Ignoring incomplete record, missing field(s): %s!" % ", ".join(missing))
        return

    try:
        meta["id"] = int(meta["id"])
    except ValueError:
        print("Ignoring project '%s' with invalid id '%s'!" % (meta["name"], meta["id"]))
        return

    archive_url = None
    for key in ("archive-tgz-url", "archive-bz2-url", "archive-zip-url"):
        if meta.get(key):
            archive_url = meta[key]
            break

    if archive_url is None:
        print("Ignoring project '%s' without archive URL!" % meta["name"])
        return

    license_name = meta.get("license", "")

    if license_name in license_blacklist:
        print("Ignoring project '%s' due to evil license '%s'!" % (meta["name"], license_name))
        return

    if license_name not in license_whitelist:
        print("WARNING: Unknown license '%s' for project '%s'!" % (license_name, meta["name"]))

        # Explicit UTF-8 so non-ASCII project names can be recorded; the
        # with-block closes the file even if the write fails.
        with io.open("graylist", "a", encoding="utf-8") as graylist:
            graylist.write(u"%s\t%s\n" % (meta["name"], license_name))
        return

    meta["archive-url"] = archive_url

    print("Next record '%s'" % meta["name"])

    sse_grab.grab_archive(meta, "freshmeat:%i" % meta["id"], SSE_PROVIDER_FRESHMEAT)

class docHandler(ContentHandler):
    """SAX handler that assembles one metadata dict per <project> element.

    Text inside the recognised child elements (see ``_FIELDS``) is
    accumulated under the corresponding metadata key.  When a project
    element ends, the collected dict is passed to ``process_record`` and
    then dropped.  Text of recognised elements that appear outside any
    project element is ignored.
    """

    # Feed element name -> metadata key its character data is stored under.
    _FIELDS = {
        "project_id": "id",
        "projectname_full": "name",
        "url_project_page": "project-url",
        "url_tgz": "archive-tgz-url",
        "url_bz2": "archive-bz2-url",
        "url_zip": "archive-zip-url",
        "license": "license",
        "latest_release_date": "version",
    }

    # Record currently being assembled; None while outside a <project>.
    # Deliberately not a dict at class level, which every instance would share.
    meta = None
    # Metadata key receiving character data; None when the current element
    # is not one of the recognised fields.
    field = None

    def startElement(self, name, attrs):
        """Start a new record on <project>, otherwise select the target field."""
        if name == "project":
            self.meta = {}
            self.field = None
        else:
            # Unrecognised elements map to None, which disables collection.
            self.field = self._FIELDS.get(name)

    def characters(self, data):
        """Append ``data`` to the current field of the current record."""
        if self.field is None or self.meta is None:
            return

        # SAX may deliver one text node in several chunks, so concatenate.
        self.meta[self.field] = self.meta.get(self.field, "") + data

    def endElement(self, name):
        """Process the finished record on </project>; stop collecting text."""
        if name == "project" and self.meta is not None:
            process_record(self.meta)
            self.meta = None

        self.field = None

def parse_xml(f):
    """Parse the XML document ``f``, feeding its SAX events to a docHandler.

    ``f`` is passed unchanged to the SAX parser's ``parse`` method.
    """
    reader = make_parser()
    reader.setContentHandler(docHandler())
    reader.parse(f)

if __name__ == "__main__":
    # Script entry point: the project feed XML is read from standard input.
    feed = sys.stdin
    parse_xml(feed)