diff options
Diffstat (limited to 'wiki_to_nitrate_xml.py')
-rwxr-xr-x | wiki_to_nitrate_xml.py | 12 |
1 files changed, 10 insertions, 2 deletions
diff --git a/wiki_to_nitrate_xml.py b/wiki_to_nitrate_xml.py
index c962620..b9d36d1 100755
--- a/wiki_to_nitrate_xml.py
+++ b/wiki_to_nitrate_xml.py
@@ -7,6 +7,7 @@ import re
 import time, datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
+from lxml import html
 from BeautifulSoup import BeautifulSoup
 
 try:
@@ -101,14 +102,21 @@ def list_categorymembers(wiki, cat_page, limit=5):
 
     return members
 
+def repl(link):
+    '''add full url addresses to the links which only have paths.'''
+    if link.startswith('/'):
+        link = 'http://fedoraproject.org' + link
+    return link
+
 def extract_to_dict(string, titles):
     '''extract wiki contents in html format and cache to table'''
-    s_text = string.get('text',{}).get('*','')
     s_tag = string.get('categories',{})
     tag = []
     for t in s_tag:
         tag.append(t.get('*',''))
-    soup = BeautifulSoup(''.join(s_text))
+    s_text = string.get('text',{}).get('*','')
+    s_text_polished = html.rewrite_links(s_text, repl)
+    soup = BeautifulSoup(''.join(s_text_polished))
     table = {}
     table['title'] = titles
     if soup.find(id='Description') == None: