summary | refs | log | tree | commit | diff | stats
path: root/wiki_to_nitrate_xml.py
diff options
context:
space:
mode:
Diffstat (limited to 'wiki_to_nitrate_xml.py')
-rwxr-xr-x wiki_to_nitrate_xml.py 12
1 files changed, 10 insertions, 2 deletions
diff --git a/wiki_to_nitrate_xml.py b/wiki_to_nitrate_xml.py
index c962620..b9d36d1 100755
--- a/wiki_to_nitrate_xml.py
+++ b/wiki_to_nitrate_xml.py
@@ -7,6 +7,7 @@ import re
import time, datetime
import xml.etree.ElementTree as ET
import xml.dom.minidom
+from lxml import html
from BeautifulSoup import BeautifulSoup
try:
@@ -101,14 +102,21 @@ def list_categorymembers(wiki, cat_page, limit=5):
return members
+def repl(link):
+ '''Return link as a full URL when it is only a site-relative path.
+
+ A link that starts with '/' gets 'http://fedoraproject.org' prepended;
+ any other link is returned unchanged. Passed as the link-rewriting
+ callback to html.rewrite_links in extract_to_dict.
+
+ NOTE(review): a protocol-relative link ('//host/path') also starts
+ with '/' and would be prefixed too -- confirm the wiki HTML never
+ contains such links.
+ '''
+ if link.startswith('/'):
+ link = 'http://fedoraproject.org' + link
+ return link
+
def extract_to_dict(string, titles):
'''extract wiki contents in html format and cache to table'''
- s_text = string.get('text',{}).get('*','')
s_tag = string.get('categories',{})
tag = []
for t in s_tag:
tag.append(t.get('*',''))
- soup = BeautifulSoup(''.join(s_text))
+ s_text = string.get('text',{}).get('*','')
+ s_text_polished = html.rewrite_links(s_text, repl)
+ soup = BeautifulSoup(''.join(s_text_polished))
table = {}
table['title'] = titles
if soup.find(id='Description') == None: