1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
|
# A very very minimal BeautifulSoup imitation.
#
# BS uses SGMLlib to parse, which converts everything to lower case.
# This uses real xml parsing to mimic the parts of BS we use.
import xml.dom.minidom
def _getText(node):
    """Return the text of the direct text-node children of *node*.

    Only immediate children whose ``nodeType`` is ``TEXT_NODE`` are
    considered; each one's ``data`` is converted with ``str`` and the
    pieces are returned as a list in child order (empty if there are none).
    """
    return [
        str(child.data)
        for child in node.childNodes
        if child.nodeType == child.TEXT_NODE
    ]
def _getNodesAsTags(root):
    """Wrap each direct element child of *root* in a ``Tag``.

    Children whose ``nodeType`` is not ``ELEMENT_NODE`` (text, comments, ...)
    are skipped.  The returned list follows child order.
    """
    return [
        Tag(child)
        for child in root.childNodes
        if child.nodeType == child.ELEMENT_NODE
    ]
class Tag(object):
    """Small wrapper around a DOM node exposing a BeautifulSoup-like surface."""

    def __init__(self, node):
        """Build the wrapper (and, recursively, wrappers for element children)."""
        self.node = node
        # Tag name exactly as reported by the DOM node.
        self.name = node.nodeName
        child_tags = _getNodesAsTags(node)
        text_chunks = _getText(node)
        # Element children (as Tag objects) first, then the direct text chunks.
        self.contents = child_tags + text_chunks
        # All direct text chunks concatenated.
        self.text = ''.join(text_chunks)

    def child_elements(self):
        """Return the ``Tag`` entries of ``contents``, leaving out text chunks."""
        return [item for item in self.contents if isinstance(item, Tag)]

    def get(self, tagname):
        """Return the text of the first child tag named *tagname*.

        Returns ``None`` when there is no such child.
        """
        child = self.first(tagname)
        return child.text if child else None

    def first(self, tagname):
        """Return the first child ``Tag`` named *tagname*, or ``None``."""
        for item in self.contents:
            if isinstance(item, Tag) and item.name == tagname:
                return item
        return None
class BeautifulSupe(object):
    """Minimal BeautifulSoup-style query helper over an ``xml.dom.minidom`` DOM.

    The parsed document is kept in ``self.dom``; query methods return ``Tag``
    wrappers around the matching DOM nodes.
    """

    def __init__(self, data):
        """Parse *data* (XML as text or bytes) into ``self.dom``.

        Leading and trailing NUL characters are removed before parsing.
        Errors raised by ``xml.dom.minidom.parseString`` propagate.
        """
        # please don't give us your null terminators
        # bytes.strip() only accepts a bytes argument, so choose the NUL
        # that matches the input type instead of always using chr(0).
        nul = b"\x00" if isinstance(data, bytes) else chr(0)
        self.dom = xml.dom.minidom.parseString(data.strip(nul))

    def first(self, tagname, root=None):
        """Return a ``Tag`` for the first node named *tagname*, or ``None``.

        Without *root* the whole document is searched through
        ``getElementsByTagName``.  With *root*, only the direct children of
        that DOM node are examined.
        """
        if root is None:
            found = self.dom.getElementsByTagName(tagname)
            return Tag(found[0]) if found else None
        for child in root.childNodes:
            if child.nodeName == tagname:
                return Tag(child)
        return None

    def fetch(self, tagname, restraints=None):
        """Return ``Tag``s for all *tagname* elements that meet *restraints*.

        *restraints* maps a child tag name to a text prefix.  An element
        matches when, for every entry, it has a direct child with that name
        whose first text chunk starts with the prefix.  With no restraints
        every *tagname* element in the document matches.
        """
        restraints = restraints or {}
        return [
            Tag(node)
            for node in self.dom.getElementsByTagName(tagname)
            if self._satisfies(node, restraints)
        ]

    def _satisfies(self, node, restraints):
        """Tell whether the direct children of *node* meet every restraint."""
        for child_name, prefix in restraints.items():
            child = self.first(child_name, node)
            if child is None:
                return False
            if not self._leading_text(child.contents).startswith(prefix):
                return False
        return True

    @staticmethod
    def _leading_text(contents):
        """Return the first text chunk in *contents*, or ``''`` if none.

        ``Tag.contents`` may be empty or may begin with child ``Tag`` objects,
        so indexing ``contents[0]`` is not safe; non-string items are skipped.
        """
        for item in contents:
            if isinstance(item, str):
                return item
        return ''

    def scour(self, prefix, suffix=None, node=None):
        """Collect ``Tag``s for descendants whose name starts with *prefix*.

        When *suffix* is truthy the node name must also end with it.  The
        search starts below *node* (the document element by default) and
        recurses through all descendants; the start node itself is not
        tested.  A matching node is listed before matches found beneath it.
        """
        if node is None:
            node = self.dom.documentElement
        matches = []
        for child in node.childNodes:
            name = child.nodeName
            if name.startswith(prefix) and (not suffix or name.endswith(suffix)):
                matches.append(Tag(child))
            matches += self.scour(prefix, suffix, child)
        return matches
|