# BitTorrent/BeautifulSupe.py

# A very, very minimal BeautifulSoup imitation.
#
# BS uses sgmllib to parse, which converts everything to lower case.
# This uses real XML parsing to mimic the parts of BS we use.

import xml.dom.minidom

def _getText(node):
    """Collect the text held directly under *node*.

    Returns a list with one ``str`` per direct child whose ``nodeType`` is
    ``TEXT_NODE``, in child order.  Children of any other node type are
    skipped, and nested elements are not descended into.
    """
    return [
        str(child.data)
        for child in node.childNodes
        if child.nodeType == child.TEXT_NODE
    ]

def _getNodesAsTags(root):
    """Wrap every direct element child of *root* in a ``Tag``.

    Only children whose ``nodeType`` is ``ELEMENT_NODE`` are wrapped; the
    returned list keeps the order in which they appear in
    ``root.childNodes``.
    """
    return [
        Tag(child)
        for child in root.childNodes
        if child.nodeType == child.ELEMENT_NODE
    ]

class Tag(object):
    """Thin wrapper around a DOM node exposing a small BS-like interface.

    Attributes set by ``__init__``:
      node     -- the wrapped DOM node
      name     -- ``node.nodeName``
      contents -- ``Tag`` objects for the direct element children, followed
                  by the strings of the direct text children
      text     -- the direct text children joined into one string
    """

    def __init__(self, node):
        self.node = node
        self.name = node.nodeName
        child_tags = _getNodesAsTags(node)
        chunks = _getText(node)
        # Element children come first, text chunks after them.
        self.contents = child_tags + chunks
        self.text = ''.join(chunks)

    def child_elements(self):
        """Return the ``Tag`` entries of ``contents`` (text strings omitted)."""
        return [item for item in self.contents if isinstance(item, Tag)]

    def get(self, tagname):
        """Return the ``text`` of the first child tag named *tagname*.

        Returns ``None`` when there is no such child.
        """
        match = self.first(tagname)
        return match.text if match else None

    def first(self, tagname):
        """Return the first direct child ``Tag`` named *tagname*, or ``None``."""
        for item in self.contents:
            if isinstance(item, Tag) and item.name == tagname:
                return item
        return None
   
class BeautifulSupe(object):
    """Minimal BeautifulSoup-like facade over an ``xml.dom.minidom`` document.

    The parsed document is kept in ``self.dom``; the lookup methods return
    ``Tag`` wrappers around the DOM nodes they find.
    """

    def __init__(self, data):
        """Parse the XML in *data* (text or bytes) into ``self.dom``.

        Leading and trailing NUL characters are stripped before parsing.
        Whatever ``xml.dom.minidom.parseString`` raises for malformed input
        propagates to the caller.
        """
        # please don't give us your null terminators
        # strip() requires its argument to be the same string type as data,
        # so pick a bytes or text NUL accordingly.
        if isinstance(data, (bytes, bytearray)):
            nul = b'\x00'
        else:
            nul = u'\x00'
        data = data.strip(nul)
        self.dom = xml.dom.minidom.parseString(data)

    def first(self, tagname, root=None):
        """Return a ``Tag`` for the first node named *tagname*, or ``None``.

        With no *root*, the whole document is searched (all descendants, in
        document order).  With a *root* DOM node, only its direct children
        are examined.
        """
        if root is None:
            candidates = self.dom.getElementsByTagName(tagname)
            node = candidates[0] if candidates else None
        else:
            node = next(
                (child for child in root.childNodes
                 if child.nodeName == tagname),
                None,
            )

        if node is None:
            return None
        return Tag(node)

    def fetch(self, tagname, restraints=None):
        """Return ``Tag`` objects for every *tagname* element that matches.

        *restraints* maps a child tag name to a text prefix.  An element
        matches when, for every entry, it has a direct child with that name
        whose first text chunk starts with the given prefix.  With no
        restraints every *tagname* element in the document is returned.
        """
        if not restraints:
            restraints = {}

        matches = []
        for node in self.dom.getElementsByTagName(tagname):
            if all(self._meets_restraint(node, name, restraints[name])
                   for name in restraints):
                matches.append(Tag(node))
        return matches

    def _meets_restraint(self, node, name, prefix):
        """Tell whether *node* has a direct child *name* whose text starts with *prefix*."""
        child = self.first(name, node)
        if child is None:
            return False
        # Tag.contents lists child tags before text, so index 0 may be a Tag
        # or may not exist at all (empty element).  Compare against the first
        # text chunk, treating "no text" as the empty string.
        chunks = [item for item in child.contents if not isinstance(item, Tag)]
        first_text = chunks[0] if chunks else ''
        return first_text.startswith(prefix)

    def scour(self, prefix, suffix=None, node=None):
        """Recursively collect descendants whose node name matches.

        A node matches when its ``nodeName`` starts with *prefix* and, if a
        non-empty *suffix* is given, also ends with *suffix*.  The search
        starts below *node* (the document element by default); the starting
        node itself is never tested.  Matches are returned as ``Tag``
        objects, each parent before its own descendants.
        """
        if node is None:
            node = self.dom.documentElement

        matches = []
        for child in node.childNodes:
            name = child.nodeName
            if name.startswith(prefix) and (not suffix or name.endswith(suffix)):
                matches.append(Tag(child))
            matches.extend(self.scour(prefix, suffix, child))
        return matches