summaryrefslogtreecommitdiffstats
path: root/BitTorrent/BeautifulSupe.py
diff options
context:
space:
mode:
Diffstat (limited to 'BitTorrent/BeautifulSupe.py')
-rw-r--r--BitTorrent/BeautifulSupe.py132
1 files changed, 132 insertions, 0 deletions
diff --git a/BitTorrent/BeautifulSupe.py b/BitTorrent/BeautifulSupe.py
new file mode 100644
index 0000000..79072d4
--- /dev/null
+++ b/BitTorrent/BeautifulSupe.py
@@ -0,0 +1,132 @@
+# A very very minimal BeautifulSoup immitation.
+#
+# BS uses SGMLlib to parse, which converts everything to lower case.
+# This uses real xml parsing to mimic the parts of BS we use.
+
+import xml.dom.minidom
+
+def _getText(node):
+ nodelist = node.childNodes
+ rc = []
+ for node in nodelist:
+ if node.nodeType == node.TEXT_NODE:
+ rc.append(str(node.data))
+ return rc
+
+def _getNodesAsTags(root):
+ nodelist = root.childNodes
+ tags = []
+ for node in nodelist:
+ if node.nodeType == node.ELEMENT_NODE:
+ tags.append(Tag(node))
+ return tags
+
+class Tag(object):
+ def __init__(self, node):
+ self.node = node
+ self.name = node.nodeName
+ self.contents = _getNodesAsTags(self.node)
+ text = _getText(self.node)
+ self.contents += text
+ self.text = ''.join(text)
+
+ def child_elements(self):
+ children = []
+ for tag in self.contents:
+ if isinstance(tag, Tag):
+ children.append(tag)
+ return children
+
+ def get(self, tagname):
+ got = self.first(tagname)
+ if got:
+ return got.text
+
+ def first(self, tagname):
+ found = None
+
+ for tag in self.contents:
+ if isinstance(tag, Tag):
+ if tag.name == tagname:
+ found = tag
+ break
+
+ return found
+
+class BeautifulSupe(object):
+
+ def __init__(self, data):
+ #please don't give us your null terminators
+ data = data.strip(chr(0))
+ self.dom = xml.dom.minidom.parseString(data)
+
+ def first(self, tagname, root = None):
+ found = None
+ if root == None:
+ e = self.dom.getElementsByTagName(tagname)
+ if len(e) > 0:
+ found = e[0]
+ else:
+ for node in root.childNodes:
+ if node.nodeName == tagname:
+ found = node
+ break
+
+ if not found:
+ return None
+
+ tag = Tag(found)
+ return tag
+
+ def fetch(self, tagname, restraints = {}):
+ e = self.dom.getElementsByTagName(tagname)
+
+ matches = []
+
+ for node in e:
+ match = 1
+
+ for restraint in restraints:
+ f = self.first(restraint, node)
+ if not f:
+ match = 0
+ break
+ text = restraints[restraint]
+ if not f.contents[0].startswith(text):
+ match = 0
+ break
+
+ if match:
+ tag = Tag(node)
+ matches.append(tag)
+
+ return matches
+
+
+ def scour(self, prefix, suffix = None, node = None):
+ if node is None:
+ root = self.dom.getElementsByTagName(self.dom.documentElement.tagName)[0]
+ node = root
+
+ matches = []
+
+ for node in node.childNodes:
+ match = 0
+
+ name = node.nodeName
+
+ if name.startswith(prefix):
+ if suffix:
+ if name.endswith(suffix):
+ match = 1
+ else:
+ match = 1
+
+ if match:
+ tag = Tag(node)
+ matches.append(tag)
+
+ matches += self.scour(prefix, suffix, node)
+
+ return matches
+